Download read_dbf.sas read_dbf.sasSubmit a comment

%macro read_dbf (filename, ds, keep_del=0, showflds=0);

 /* Author: Richard A. DeVenezia
  *   Date: 18 May 1993
  *
  * 02/23/96 RAD change date variables to default to mmddyy10.
  *              format and informat;
  * 10/24/96 RAD change method used to skip header portion
  *              (use cut -c -<hdrLen+1>)
  *  4/ 9/98 RAD ensure variable names are valid SAS names
  *  1/ 4/00 RAD file descriptor date of last update year item is
  *              100+ for year 2000+.  Mod with 100 and rely on SAS
  *              YEARCUTOFF option
  * 11/13/00 RAD rewrite data input portion to remove reliance on UNIX cut
  */

 /* Read a DBASE III(+) file into a SAS dataset
  * filename - file in operating system assumed to be in .dbf format
  * ds - destination SAS dataset
  * keep_del - 0 to ignore deleted records, 1 to retain deleted records
  *
  * DBASE III structures
  *
  * File Descriptor:
  * Ofs Len Field
  *  0   1  dBASE III version number (3 ls bits)
  *  1   3  date of last update (YY MM DD in pib1. format)
  *  4   4  number of records (longint - pib4.)
  *  8   2  length of header structure (int - pib2.)
  *         [header + field descriptions ]
  * 10   2  length of each record (int - pib2.) [deleted flag + fields]
  * 12  20  reserved
  *
  * Field Descriptors (32 chars each) immediately follow header
  * (number of fields implied from length of header structure)
  * Ofs Len Field
  *  0  11  field name (variable) zero terminated
  * 11   1  field type (C N F L D M)
  *         C - Character: Ascii
  *         N/F - Numeric: -.0123456789
  *         L - Logical: ?YyTtNnFf
  *         D - Date: yyyymmdd
  *         M - Memo: 10 digit pointer for .dbt file
  * 12   4  field data address (do not care)
  * 16   1  field length (short - pib1.)
  * 17   1  decimals (digits to right of decimal point) (short - pib1.)
  * 18  14  reserved
  *
  * immediately following field descriptors
  * '00'x'0D'x <DATA> - dBASE III
  * '0D'x <DATA> - dBASE III+
  * header length field should take care of skipping over these positions
  *
  * <DATA>
  * all data is stored in character format (i.e. no binary integer or
  * floating point formats), numerics are stored in ascii representation
  */

  %local notes mprint symgen;
  %let notes  = %sysfunc (getoption(NOTES));
  %let mprint = %sysfunc (getoption(MPRINT));
  %let symgen = %sysfunc (getoption(SYMBOLGEN));

  options nonotes nomprint nosymbolgen;

  %local filename ds keep_del showflds;
  %local i type;

  %if (%quote(&filename)=) or (%quote(&ds)=) %then %do;
    %put usage: read_dbf(filename, ds);
    %goto ByeBye;
  %end;

 %*
  * Read File Descriptor and each Field Descriptor
  *;

  %local hdrLen nFields recLen numRex;

  %local maxLenV; %* maximum length of SAS variable name in output dataset;

  %if (&sysver >= 7)
    %then %let maxLenV = 11;    %* dBase field names can be 11 characters at most;
    %else %let maxLenV = 8;     %* SAS variable names can be 8 characters at most;

  data &DS (keep=i name varname type length decimals ) ;
    infile "&filename" unbuffered recfm=n;

    * dBASE file descriptor (32 bytes);
    input
      version  pib1.
      u1       pib1. %* YY - Note: value is 100+ for year 2000+ ;
      u2       pib1. %* MM ;
      u3       pib1. %* DD ;

      %*numRex   pib4. (works only on Intel based SAS);
      rx1      pib1.
      rx2      pib1.
      rx3      pib1.
      rx4      pib1.

      %*hdrLen   pib2.  (works only on Intel based SAS);
      hd1      pib1.
      hd2      pib1.

      %*recLen   pib2.  (works only on Intel based SAS);
      rl1      pib1.
      rl2      pib1.

      reserved $char20.
    ;

    version = mod (version, 8);
    if version ^= 3 then stop;

    numRex = rx1 + 256 * (rx2 + 256 * (rx3 + 256 * rx4));
    hdrLen = hd1 + 256 * (hd2);
    recLen = rl1 + 256 * (rl2);

*   u1 = mod (u1, 100);
*   updated  = mdy (u2, u3, u1);
*   format updated date7.;

    numFlds  = (hdrLen - 33) / 32;
*   fileSize = hdrLen + recLen*numRex + 1;

    call symput ('nFields',trim(left(put(numFlds,8.))));
    call symput ('hdrLen', trim(left(put(hdrLen,8.))));
    call symput ('recLen', trim(left(put(recLen,8.))));
    call symput ('numRex', trim(left(put(numRex,8.))));

    put 'NOTE:' hdrLen= recLen= numFlds= numRex=;

    * read each dBASE field descriptor (32 bytes each);
    * field name will be mapped to a SAS eight character variable name;

    array _var  [2000] $&maxLenV _temporary_;
    length varname $&maxLenV;

    do i = 1 to numFlds;
      input
        name     $CHARZB11.   %* convert null chars to blanks;
        type     $CHAR1.
        reserve1 $CHAR4.
        length   pib1.
        decimals pib1.
        reserve2 $CHAR14.
      ;

      if type='F' then type='N';

      * map first word of field name to variable name;
      * change any non alphanumeric to underscore;

      varname = upcase(name);
      varname = scan (varname, 1, ' ');
      if not ('A' <= substr (varname,1,1) <= 'Z') then
        varname = '_' || varname;

      do j = 1 to length (varname);
        if not ('A' <= substr (varname,j,1) <= 'Z' or
                '0' <= substr (varname,j,1) <= '9')
        then
          substr (varname,j,1) = '_';
      end;

      * check if this SAS varname has been used already;

      found = 0;
      do j = 1 to 2000 while (_var[j] ^= "" and not found);
        found = ( _var[j] = varname );
      end;

      if not found then
        _var[j]=varname;
      else do;
        do k = 1 to 2000 while (found);
          n = left(put(k,4.));
          varname = substr(varname,1,8-length(n)) || n;
          found = 0;
          do j = 1 to 2000 while (_var[j] ^= "" and not found);
            found = ( _var[j] = varname );
          end;
        end;
        _var[j] = varname;
      end;

      output;
    end;

    stop;
  run;

  %if (&syserr ^= 0) %then %goto ByeBye;

 %*
  * localize all macro variables that describe each variable in .dbf file;
  *;

  %do i = 1 %to &nFields;
    %local
      var&i
      label&i
      type&i
      len&i
      dec&i
    ;
  %end;

 %*
  * now populate those macro variables with values
  *;

  data _null_;
    set &ds;

    n = left(put(i,8.));
    call symput ('var'  ||n, trim(varname));
    call symput ('label'||n, trim(name));
    call symput ('type' ||n, trim(type));
    call symput ('len'  ||n, trim(left(put(length,8.))));
    call symput ('dec'  ||n, trim(left(put(decimals,8.))));
  run;

  %if (&syserr ^= 0) %then %goto ByeBye;

  %if (&showflds) %then %do;
    %* report file structure;
    %put &filename;
    %put hdrLen=&hdrLen recLen=&recLen numRex=&numRex;
    %do i=1 %to &nfields;
      %put %substr(&i..%str(   ),1,3) &&var&i &&label&i &&type&i &&len&i &&dec&i;
    %end;
  %end;

 %*
  * read data part of .dbf file
  *;

  %local type;

  data &ds;
    infile "&filename" unbuffered recfm=n;

    attrib deleted length=$1;

    %do i=1 %to &nFields;
      %let type=&&type&i;

      ATTRIB &&var&i label="&&label&i"
      %if (&type=D) %then length=4 format=mmddyy10. informat=mmddyy10.; %else
      %if (&type=L) %then length=3 format=1.    ;      %else
      %if (&type=C) %then length=$&&len&i;             %else
      %if (&type=N) %then format=&&len&i...&&dec&i;
      ;
    %end;

    attrib source length=$150 label='Data source';
    retain source "&filename";

    input @&hdrLen deleted $CHAR1.;

    do i = 1 to &numRex;
      input
        deleted  $CHAR1.
      ;

      %if (not &keep_del) %then %do;
        if deleted ne ' ' then do;
         %* since this record is tagged as deleted in the .dbf and the macro
          * was invoked indicating deleted records are not to be saved,
          * skip over the rest of the record;
          input +(&recLen-2) dummy $CHAR1.;
          drop dummy;
          goto NextRcrd;
        end;
      %end;

      input
        %do i=1 %to &nFields;
          %let type=&&type&i;
          %if (&type=D) %then &&var&i yymmdd8. ;         %else
          %if (&type=L) %then l&i $CHAR1. ;              %else
          %if (&type=C) %then &&var&i $CHAR&&len&i...;   %else
          %if (&type=N) %then &&var&i &&len&i...&&dec&i; %else
          %if (&type=M) %then &&var&i $CHAR&&len&i...;
      %end;
     ;

    %do i=1 %to &nfields;
      %let type=&&type&i;
      %if (&type=L) %then %do;
        if (upcase(l&i) in ("T" "Y")) then &&var&i=1; else
        if (upcase(l&i) in ("F" "N")) then &&var&i=0;
        drop l&i;
      %end;
      %else
      %if (&type=M) %then %do;
        drop &&var&i;
      %end;
    %end;

NextRcrd:
      %if (&keep_del) %then %do;
        output;
      %end;
      %else %do;
        if deleted = ' ' then output;
      %end;
    end;

    stop;

    drop deleted;
  run;

%ByeBye:

  options &symgen &mprint &notes;

%mend read_dbf;