From b9fa188a7d385eb2dac901d95c3d46184222f29c Mon Sep 17 00:00:00 2001 From: bryce Date: Thu, 19 Aug 2021 00:17:17 -0600 Subject: [PATCH 1/2] added hll conversion to patterns --- src/pandas_pattern_generator.py | 37 ++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/pandas_pattern_generator.py b/src/pandas_pattern_generator.py index 812a672..ea24e76 100644 --- a/src/pandas_pattern_generator.py +++ b/src/pandas_pattern_generator.py @@ -2,9 +2,10 @@ import numpy as np class PandasPatternGenerator(): - def __init__(self,n,k): + def __init__(self,n,k,hll_bits): self.n = n self.k = k + self.hll_bits = hll_bits self.pattern_index = range(1,self.k+1) self.patterns = {i: self.generate_pattern(n,i) for i in self.pattern_index} self.overlaps = pd.DataFrame( @@ -14,6 +15,7 @@ def __init__(self,n,k): [self.get_overlap(i,j) if i!=j else n//i for i in self.pattern_index] for j in self.pattern_index ] ) + self.hll_patterns = {k: self.compute_hll_array(v,self.hll_bits) for k,v in self.patterns.items()} def generate_pattern(self,n,i): return pd.Series(range(0,n,i)) @@ -22,3 +24,36 @@ def get_overlap(self,i,j): pat_1 = self.patterns[i] pat_2 = self.patterns[j] return len(pat_1.to_frame().merge(pat_2.to_frame(),on=0)) + + def compute_first_bit(self,a): + "Compute the position of the first nonzero bit for each int in an array." + # TODO: consider making this less memory-hungry + bits = np.bitwise_and.outer(a, 1 << np.arange(32)) + bits = bits.cumsum(axis=1).astype(bool) + return 33 - bits.sum(axis=1) + + def compute_hll_array(self, obj, b): + # b is the number of bits + + if not 8 <= b <= 16: + raise ValueError("b should be between 8 and 16") + num_bits_discarded = 32 - b + m = 1 << b + + # Get an array of the hashes + hashes = hash_pandas_object(obj, index=False) + if isinstance(hashes, pd.Series): + hashes = hashes._values + hashes = hashes.astype(np.uint32) + + # Of the first b bits, which is the first nonzero? 
+ j = hashes >> num_bits_discarded + first_bit = self.compute_first_bit(hashes) + + # Pandas can do the max aggregation + df = pd.DataFrame({"j": j, "first_bit": first_bit}) + series = df.groupby("j").max()["first_bit"] + + # Return a dense array so we can concat them and get a result + # that is easy to deal with + return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8) From f47ffa85a88651f81dd378f51133055dcf5dc4e0 Mon Sep 17 00:00:00 2001 From: bryce Date: Thu, 19 Aug 2021 00:23:36 -0600 Subject: [PATCH 2/2] added a few comments --- src/pandas_pattern_generator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pandas_pattern_generator.py b/src/pandas_pattern_generator.py index ea24e76..891650b 100644 --- a/src/pandas_pattern_generator.py +++ b/src/pandas_pattern_generator.py @@ -3,9 +3,9 @@ class PandasPatternGenerator(): def __init__(self,n,k,hll_bits): - self.n = n - self.k = k - self.hll_bits = hll_bits + self.n = n # number of rows in dataset + self.k = k # number of patterns created + self.hll_bits = hll_bits # hll vectors will have 2**hll_bits many dimensions self.pattern_index = range(1,self.k+1) self.patterns = {i: self.generate_pattern(n,i) for i in self.pattern_index} self.overlaps = pd.DataFrame( @@ -26,6 +26,7 @@ def get_overlap(self,i,j): return len(pat_1.to_frame().merge(pat_2.to_frame(),on=0)) def compute_first_bit(self,a): + # shamelessly copied from: https://github.com/dask/dask/blob/main/dask/dataframe/hyperloglog.py "Compute the position of the first nonzero bit for each int in an array." # TODO: consider making this less memory-hungry bits = np.bitwise_and.outer(a, 1 << np.arange(32)) @@ -33,6 +34,7 @@ def compute_first_bit(self,a): return 33 - bits.sum(axis=1) def compute_hll_array(self, obj, b): + # shamelessly copied from: https://github.com/dask/dask/blob/main/dask/dataframe/hyperloglog.py # b is the number of bits if not 8 <= b <= 16: