/* Richard A. DeVenezia
* www.devenezia.com
* 3/18/2004
*
* This material was originally posted to SAS-L on March 13, 2004
* in thread "Super Size Dataset from Cartesian Join"
*
* The original problem was posted by David Hou who was using SQL to
* calculate sphere distances between every pair of points in a
* population of 60,000 points, keeping only the point pairs that
* are 1 mile apart.
*
* The problem rapidly degenerated into an exercise in optimizing
* array processing in data step.
*
* Mike Zdeb offered some solutions going back to real world data,
* and Howard Schreier demonstrated an exquisite reformulation of
* the problem and SQL selection that could be optimized by the SQL planner.
*/
/*
* Generate N points
*/
%let n = 60000;
%let n = 600;
%let n = 6000;
%let n = 15000;
data points;
do id = 1 to &n;
x = ranuni(1);
y = ranuni(1);
output;
end;
run;
proc sql noprint;
select count(*) into :n from points;
quit;
%let t0 = %sysfunc(datetime());
%let maxdistance = 0.05;
%let maxdistance_squared = %sysevalf (&maxdistance**2);
%let report_every_pct = 20;
%let report_every = %sysevalf(%sysfunc(comb(&n,2))/100*&report_every_pct);
/*
* Compute distance between each point and keep the pairs
* that are close.
*
* Note: nc = comp(n,2) = n*(n-1)/2 distances will need to be computed
* If n=60,000 then nc=1,799,970,000
*/
data close;
array _x[&n] _temporary_;
array _y[&n] _temporary_;
do until (eopoints);
set points end=eopoints;
_x[_n_] = x;
_y[_n_] = y;
_n_ + 1;
end;
do i = 1 to %eval(&n-1);
_xi = _x[i];
_yi = _y[i];
_xlower = _xi - &maxdistance;
_xupper = _xi + &maxdistance;
_ylower = _yi - &maxdistance;
_yupper = _yi + &maxdistance;
do j = i+1 to &n;
_xj = _x[j];
_yj = _y[j];
pairCount + 1;
reportCount + 1;
if reportCount >= &report_every then do;
reportCount = 0;
pctDone = pairCount / %sysevalf( (&n-1) * &n/2 * 1/100);
pctOut = outputCount / pairCount * 100;
walltime = datetime()-&t0;
put walltime=time8. outputCount=comma15. pctOut=4.1 pairCount=comma15. pctDone=4.1;
end;
* less taxing tests;
* if abs(_x[i] - _x[j]) > &maxdistance then continue;
* if abs(_y[i] - _y[j]) > &maxdistance then continue;
if _xj > _xupper | _xj < _xlower then continue;
if _yj > _yupper | _yj < _ylower then continue;
distance_squared = (_xi-_xj)**2 + (_yi-_yj)**2 ;
if distance_squared <= &maxdistance_squared then do;
outputCount + 1;
output;
end;
end;
end;
put outputCount=comma12. pairCount=comma12.;
stop;
keep i j distance_squared;
run;
/*
* Here is Howard Schreiers reformulation
* - perform a geometrical transform so that all points lie
* on the vertices of an integer lattice
*
* Geometrically the transformation changes the truth cases of the
* conditional clause
* A.x between B.x - &d and B.x + &d and
* A.y between B.y - &d and B.y + &d
* -into-
* being a truth case of one of the following
* A.xgrid - 1 = B.xgrid and A.ygrid - 1 = B.xgrid
* A.xgrid - 1 = B.xgrid and A.ygrid = B.xgrid
* A.xgrid - 1 = B.xgrid and A.ygrid + 1 = B.xgrid
* A.xgrid = B.xgrid and A.ygrid - 1 = B.xgrid
* A.xgrid = B.xgrid and A.ygrid 1 = B.xgrid
* A.xgrid = B.xgrid and A.ygrid + 1 = B.xgrid
* A.xgrid + 1 = B.xgrid and A.ygrid - 1 = B.xgrid
* A.xgrid + 1 = B.xgrid and A.ygrid 1 = B.xgrid
* A.xgrid + 1 = B.xgrid and A.ygrid + 1 = B.xgrid
* The 9 cases are enumerable by a join of
* A, (-1,0,1), (-1,0,1)
*/
/*
* Step 1
* Transform the points to lie on integer vertices (lattice points).
* Unit measure corresponds to maxdistance
*/
data gridded;
set points;
xgrid = round(round(x,&maxdistance)/&maxdistance);
ygrid = round(round(y,&maxdistance)/&maxdistance);
run;
proc sql noprint;
select count(*) into :ngrid from
(select distinct xgrid,ygrid from gridded);
quit;
%put n points = &n;
%put n grid points = &ngrid;
/*
* Step 2
* Set up a simple adjacency table that will be cartesian joined
* to points in a sub select
*/
data offsets;
do xoffset = -1 to 1;
do yoffset = -1 to 1;
output;
end;
end;
run;
/*
* The grid transformation ensures points that are
* within maxdistance of each other will be on
* adjacent lattice points
*/
proc sql _method;
create table sqlfast as
select distinct
A.id as i,
B.id as j,
(A.x-B.x)**2 + (A.y-B.y)**2 as distance_squared
from (select * from gridded, offsets) as A
, gridded as B
where A.id < B.id and
calculated distance_squared <= &maxdistance_squared and
A.xgrid + xoffset = B.xgrid and
A.ygrid + yoffset = B.ygrid
order by i, j;
quit;
%put sqloops = &sqloops;
/*
* Richards tweak of sqlfast. Subselect A performs
* three way join using minimal set of offsets
*/
data offsets;
do offset = -1 to 1;
output;
end;
run;
proc sql _method;
create table sqlfast2 as
select distinct
A.id as i,
B.id as j,
(A.x-B.x)**2 + (A.y-B.y)**2 as distance_squared
from (select gridded.*
, x.offset as xoffset
, y.offset as yoffset
from gridded
, offsets as x
, offsets as y
) as A
, gridded as B
where A.id < B.id and
calculated distance_squared <= &maxdistance_squared and
A.xgrid + A.xoffset = B.xgrid and
A.ygrid + A.yoffset = B.ygrid
order by i, j;
quit;
%put sqloops = &sqloops;