/* Richard A. DeVenezia * www.devenezia.com * 3/18/2004 * * This material was originally posted to SAS-L on March 13, 2004 * in thread "Super Size Dataset from Cartesian Join" * * The original problem was posted by David Hou who was using SQL to * calculate sphere distances between every pair of points in a * population of 60,000 points, keeping only the point pairs that * are 1 mile apart. * * The problem rapidly degenerated into an exercise in optimizing * array processing in data step. * * Mike Zdeb offered some solutions going back to real world data, * and Howard Schreier demonstrated an exquisite reformulation of * the problem and SQL selection that could be optimized by the SQL planner. */ /* * Generate N points */ %let n = 60000; %let n = 600; %let n = 6000; %let n = 15000; data points; do id = 1 to &n; x = ranuni(1); y = ranuni(1); output; end; run; proc sql noprint; select count(*) into :n from points; quit; %let t0 = %sysfunc(datetime()); %let maxdistance = 0.05; %let maxdistance_squared = %sysevalf (&maxdistance**2); %let report_every_pct = 20; %let report_every = %sysevalf(%sysfunc(comb(&n,2))/100*&report_every_pct); /* * Compute distance between each point and keep the pairs * that are close. * * Note: nc = comp(n,2) = n*(n-1)/2 distances will need to be computed * If n=60,000 then nc=1,799,970,000 */ data close; array _x[&n] _temporary_; array _y[&n] _temporary_; do until (eopoints); set points end=eopoints; _x[_n_] = x; _y[_n_] = y; _n_ + 1; end; do i = 1 to %eval(&n-1); _xi = _x[i]; _yi = _y[i]; _xlower = _xi - &maxdistance; _xupper = _xi + &maxdistance; _ylower = _yi - &maxdistance; _yupper = _yi + &maxdistance; do j = i+1 to &n; _xj = _x[j]; _yj = _y[j]; pairCount + 1; reportCount + 1; if reportCount >= &report_every then do; reportCount = 0; pctDone = pairCount / %sysevalf( (&n-1) * &n/2 * 1/100); pctOut = outputCount / pairCount * 100; walltime = datetime()-&t0; put walltime=time8. outputCount=comma15. pctOut=4.1 pairCount=comma15. pctDone=4.1; end; * less taxing tests; * if abs(_x[i] - _x[j]) > &maxdistance then continue; * if abs(_y[i] - _y[j]) > &maxdistance then continue; if _xj > _xupper | _xj < _xlower then continue; if _yj > _yupper | _yj < _ylower then continue; distance_squared = (_xi-_xj)**2 + (_yi-_yj)**2 ; if distance_squared <= &maxdistance_squared then do; outputCount + 1; output; end; end; end; put outputCount=comma12. pairCount=comma12.; stop; keep i j distance_squared; run; /* * Here is Howard Schreiers reformulation * - perform a geometrical transform so that all points lie * on the vertices of an integer lattice * * Geometrically the transformation changes the truth cases of the * conditional clause * A.x between B.x - &d and B.x + &d and * A.y between B.y - &d and B.y + &d * -into- * being a truth case of one of the following * A.xgrid - 1 = B.xgrid and A.ygrid - 1 = B.xgrid * A.xgrid - 1 = B.xgrid and A.ygrid = B.xgrid * A.xgrid - 1 = B.xgrid and A.ygrid + 1 = B.xgrid * A.xgrid = B.xgrid and A.ygrid - 1 = B.xgrid * A.xgrid = B.xgrid and A.ygrid 1 = B.xgrid * A.xgrid = B.xgrid and A.ygrid + 1 = B.xgrid * A.xgrid + 1 = B.xgrid and A.ygrid - 1 = B.xgrid * A.xgrid + 1 = B.xgrid and A.ygrid 1 = B.xgrid * A.xgrid + 1 = B.xgrid and A.ygrid + 1 = B.xgrid * The 9 cases are enumerable by a join of * A, (-1,0,1), (-1,0,1) */ /* * Step 1 * Transform the points to lie on integer vertices (lattice points). * Unit measure corresponds to maxdistance */ data gridded; set points; xgrid = round(round(x,&maxdistance)/&maxdistance); ygrid = round(round(y,&maxdistance)/&maxdistance); run; proc sql noprint; select count(*) into :ngrid from (select distinct xgrid,ygrid from gridded); quit; %put n points = &n; %put n grid points = &ngrid; /* * Step 2 * Set up a simple adjacency table that will be cartesian joined * to points in a sub select */ data offsets; do xoffset = -1 to 1; do yoffset = -1 to 1; output; end; end; run; /* * The grid transformation ensures points that are * within maxdistance of each other will be on * adjacent lattice points */ proc sql _method; create table sqlfast as select distinct A.id as i, B.id as j, (A.x-B.x)**2 + (A.y-B.y)**2 as distance_squared from (select * from gridded, offsets) as A , gridded as B where A.id < B.id and calculated distance_squared <= &maxdistance_squared and A.xgrid + xoffset = B.xgrid and A.ygrid + yoffset = B.ygrid order by i, j; quit; %put sqloops = &sqloops; /* * Richards tweak of sqlfast. Subselect A performs * three way join using minimal set of offsets */ data offsets; do offset = -1 to 1; output; end; run; proc sql _method; create table sqlfast2 as select distinct A.id as i, B.id as j, (A.x-B.x)**2 + (A.y-B.y)**2 as distance_squared from (select gridded.* , x.offset as xoffset , y.offset as yoffset from gridded , offsets as x , offsets as y ) as A , gridded as B where A.id < B.id and calculated distance_squared <= &maxdistance_squared and A.xgrid + A.xoffset = B.xgrid and A.ygrid + A.yoffset = B.ygrid order by i, j; quit; %put sqloops = &sqloops;