/* * Richard A. DeVenezia * August 21, 2004 * * Similar content posted: * Newsgroups: comp.soft-sys.sas * Local: Thurs, Aug 19 2004 10:13 am * Subject: Re: group by first name or last name Art wrote: > How to group people by their first name OR last name, > 1 John Smith > 2 George Smith > 3 Bill Clinton > 4 George Bush > I need to put 1,2,4 into to one group. Is there any way I can do this > in SAS or SQL? Art: This is an interesting question. General statement of problem: ----------------------------- Given: P = p{i} = (p{i,1),p{i,2}), a set of pairs (key1, key2). Find: The distinct groups, G = g{x}, of P, such that each pair p in a group g has this property: key1 matches key1 of any other pair in g. -or- key2 matches key2 of any other pair in g. ----------------------------- Consideration 1: ----------------------------- Straight up SQL, if possible, would no doubt require a full outer join and likely require multiple passes. ----------------------------- Consideration 2: ----------------------------- The following is an iterative way using version 9 hashes. Two hashes maintain the groupId assigned to each key value. Two additional hashes are used to maintain group mapping paths. When the data can be passed without causing a mapping, then the groups have been fully determined. A final pass is done, at which point the groupIds are assigned to each pair and the data is output to a table. ----------------------------- [ I haven't taken the time to prove if this algorithm is correct (or defeatable) for all data sequences. Since there is no proof, one must assume there is a possibility ever so slight that some data sequence could defeat the algorithm (i.e. the data truly indicates N groups and the algorithm finds >N groups) ] */ options ls=130; %let seed = %sysfunc(mod(%sysfunc(compress(%sysfunc(constant(e)),.)),2**31)); data pairs; n = 20; do id = 1 to n; key1 = int (n*ranuni(&seed)); key2 = int (n*ranuni(&seed)); output; end; stop; drop n; run; /* data pairs; id + 1; input key1 $ key2 $; cards; John Smith George Smith Bill Clinton George Bush run; data pairs; id + 1; input key1 key2 ; format _numeric_ 4.; cards; 1 2 3 2 4 5 3 6 4 2 run; */ %let dbg = *; %* disable debugging puts; %let dbg = ; %* enable debugging puts; data pairsWithGroupAssignments ; declare hash one(); one.definekey ('key1'); one.definedata ('key1', 'groupid'); one.definedone(); declare hash two(); two.definekey ('key2'); two.definedata ('key2', 'groupid'); two.definedone(); declare hash map1(); map1.definekey ('from'); map1.definedata ('from', 'to'); map1.definedone(); declare hash map2(); map2.definekey ('from'); map2.definedata ('from', 'to'); map2.definedone(); _groupId = 0; noMappings = 0; nPass = 0; do until (noMappings and outputDone); doOutput = noMappings; noMappings = 1; * loop through the data set and * assign a tentative group id to each pair; nPass + 1; do _n_ = 1 to numberOfPairs; set pairs nobs=numberOfPairs point=_n_; * retrieve groupId previously assigned to each item in the pair; rc1 = one.find(); g1 = groupId; rc2 = two.find(); g2 = groupId; * if this is the final pass, then outputDone flag will be set; if doOutput then do; * output pair data with final groupId and * continue to next observation; output; continue; end; &dbg. put id= '(' key1 +(-1) ', ' key2 +(-1) ') ' @; if rc1 ne 0 and rc2 ne 0 then do; /** / addboth: * neither item of the pair had been previously assigned a groupId * compute a new groupId and record it as assigned to each item of the pair /**/ _groupId + 1; groupId = _groupId; one.add (); two.add (); &dbg. put 'add ' key1= 'and ' key2= 'to ' groupId=; end; else if rc1 ne 0 and rc2 = 0 then do; /** / add1: * item1 of the pair has not been previously assigned a groupId * record it as assigned to the group item2 is assigned to /**/ groupId = g2; one.add(); &dbg. put 'add ' key1= 'to ' groupId=; end; else if rc1 = 0 and rc2 ne 0 then do; /** / add2: * item2 of the pair has not been previously assigned a groupId * record it as assigned to the group item1 is assigned to /**/ groupId = g1; two.add(); &dbg. put 'add ' key2= 'to ' groupId=; end; else if g1 > g2 then do; /** / g1g2: * the first row in which item2 was assigned a groupId occurred before * the first row in which item1 was assigned a groupId. * * since two items are in the same pair, the groupId g1 must be * combined with groupId g2. This is in accordance with the group * construction criteria /**/ from = g1; to = g2; * determine lowest g2 groupid, by following map1; _to = to; do while (map1.find(key:_to) = 0); _to = to; end; * record the relation between g1 and the lowest g2; from = g1; map1.replace(); * record item1 as assigned to lowest g2 group; groupId = to; one.replace(); &dbg. put 'add ' key1= 'to ' groupId= 'mapped from key1 group ' from; noMappings = 0; end; else if g2 > g1 then do; /** / g2g1: * the first row in which item1 was assigned a groupId occurred before * the first row in which item2 was assigned a groupId. * * since two items are in the same pair, the groupId g2 must be * combined with groupId g1. This is in accordance with the group * construction criteria /**/ from = g2; to = g1; * determine lowest g1 groupid by following map2; to_ = to; do while (map2.find(key:to_) = 0); to_ = to; end; * record the relation between g2 and the lowest g1; from = g2; map2.replace(); * record item2 as assigned to lowest g1 group; groupId = to; two.replace(); &dbg. put 'add ' key2= 'to ' groupId= 'mapped from key2 group ' from; noMappings = 0; end; else do; /** / same: /**/ &dbg. put rc1= rc2= g1= g2=; end; end; &dbg. put; outputDone = doOutput; end; put 'NOTE: Data iterated ' npass 'times.'; /** / two.output(dataset:'g2'); one.output(dataset:'g1'); map1.output(dataset:'map1'); map2.output(dataset:'map2'); /**/ stop; keep id key1 key2 groupId; format _numeric_ 8.; run; proc sql noprint; select count(distinct groupId) into :ngroups from &syslast; create view counts as select groupid, count(*) as n format=4. from &syslast group by groupId order by n descending ; quit; %put ngroups=&ngroups;