Wednesday, May 29, 2013

Bloom Filter and Distributed Map


-- Export the (ks_uid, delist_date) pairs of delist_user for the dt value
-- ${dateString} into the local directory delisted_user_${networkAbbr}.
-- The same directory name is later handed to distributed_map(...) as a
-- lookup source in the filtering query.
-- NOTE(review): ${dateString} is substituted unquoted. Confirm that dt is
-- compared correctly for the date format in use.
INSERT OVERWRITE LOCAL DIRECTORY 'delisted_user_${networkAbbr}'
SELECT
    ks_uid,
    delist_date
FROM delist_user
WHERE dt = ${dateString}
...
4:45 PM
 -- Filtering subquery over collect_moment_contrib_view_${networkAbbr}.
 -- A row is kept only when all three membership tests below come back
 -- negative. The enclosing query and the closing parenthesis are not
 -- part of this fragment.
 (select  *
    from collect_moment_contrib_view_${networkAbbr}
    -- Test 1: the composite key ks_uid + underscore + content_id must not
    -- be reported by the filter loaded from dup_moment_bloom_${networkAbbr}.
    -- The key is built with the same concat expression that is fed to
    -- bloom() when that filter is generated, so the two must stay in sync.
    -- NOTE(review): a Bloom filter can return false positives, so a few
    -- non-duplicate rows may presumably be dropped as well. Confirm that
    -- this is acceptable.
    where ! bloom_contains(concat( cast(ks_uid as string), "_", content_id),
           distributed_bloom( 'dup_moment_bloom_${networkAbbr}'))
      -- Test 2: ks_uid as a string must not be reported by the filter
      -- loaded from optout_bloom_${networkAbbr}. Presumably this holds the
      -- opted-out users. Its construction is not shown here.
      and ! bloom_contains( cast(ks_uid as string),
           distributed_bloom( 'optout_bloom_${networkAbbr}'))
      -- Test 3: looking up ks_uid in delisted_user_${networkAbbr} must
      -- yield null. Presumably null means the user has no entry in that
      -- exported file, i.e. is not delisted. Verify against the UDF docs.
      and distributed_map( ks_uid, "delisted_user_${networkAbbr}" ) is null
4:46 PM
-- Aggregate the ks_uid + underscore + content_id keys of every
-- duplicate_moments row labelled DUPLICATE, for the given network and dt,
-- with bloom(...) and write the result to the local directory
-- dup_moment_bloom_${networkAbbr}.
-- The key expression must match the one used at lookup time with
-- bloom_contains(...), otherwise no key will ever match.
INSERT OVERWRITE LOCAL DIRECTORY 'dup_moment_bloom_${networkAbbr}'
SELECT
    bloom(concat(cast(ks_uid AS string), "_", content_id))
FROM duplicate_moments
WHERE label = "DUPLICATE"
  AND network_abbr = "${networkAbbr}"
  AND dt = ${dateString}
;
-- Register the exported directory as a session resource so that later
-- queries can refer to it by name, e.g. through distributed_bloom(...).
ADD FILE dup_moment_bloom_${networkAbbr};

No comments:

Post a Comment