Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions src/pandas_pattern_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import numpy as np

class PandasPatternGenerator():
def __init__(self,n,k):
self.n = n
self.k = k
def __init__(self,n,k,hll_bits):
self.n = n # number of rows in the dataset
self.k = k # number of patterns created
self.hll_bits = hll_bits # each HLL vector will have 2**hll_bits entries
# Pattern ids run from 1 to k inclusive.
self.pattern_index = range(1,self.k+1)
# Pattern i is built by generate_pattern(n, i); keyed by its id.
self.patterns = {i: self.generate_pattern(n,i) for i in self.pattern_index}
self.overlaps = pd.DataFrame(
Expand All @@ -14,6 +15,7 @@ def __init__(self,n,k):
[self.get_overlap(i,j) if i!=j else n//i for i in self.pattern_index] for j in self.pattern_index
]
)
self.hll_patterns = {k: self.compute_hll_array(v,self.hll_bits) for k,v in self.patterns.items()}

def generate_pattern(self, n, i):
    """Return the multiples of ``i`` that lie in ``[0, n)`` as a pandas Series."""
    multiples_below_n = range(0, n, i)
    return pd.Series(multiples_below_n)
Expand All @@ -22,3 +24,38 @@ def get_overlap(self,i,j):
# Look up the two stored patterns by their ids.
pat_1 = self.patterns[i]
pat_2 = self.patterns[j]
# Each unnamed Series becomes a one-column frame whose column label is 0.
# Merging on that column (pandas' default join is inner) keeps only the
# values present in both patterns, so the row count is the overlap size.
return len(pat_1.to_frame().merge(pat_2.to_frame(),on=0))

def compute_first_bit(self, a):
    """Return the 1-based position of the lowest set bit of each int in ``a``.

    Only bits 0..31 are examined; an element with none of them set yields 33.
    Adapted from dask's ``dask/dataframe/hyperloglog.py``.
    """
    # TODO: consider making this less memory-hungry (builds a len(a) x 32 table)
    single_bit_masks = 1 << np.arange(32)
    isolated = np.bitwise_and.outer(a, single_bit_masks)
    # The running sum along a row becomes nonzero at the first set bit and
    # stays nonzero afterwards, so counting nonzero columns gives
    # 32 - (index of the first set bit).
    at_or_past_first = isolated.cumsum(axis=1) != 0
    return 33 - at_or_past_first.sum(axis=1)

def compute_hll_array(self, obj, b):
    """Build the HyperLogLog register array for a pandas object.

    Adapted from dask's ``dask/dataframe/hyperloglog.py``.

    Parameters
    ----------
    obj
        A pandas object accepted by ``pandas.util.hash_pandas_object``.
    b : int
        Number of hash bits used to pick a register; must be in ``[8, 16]``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of length ``2**b``. Entry ``r`` is the largest
        first-set-bit position among the hashes routed to register ``r``,
        or 0 when no hash landed there.

    Raises
    ------
    ValueError
        If ``b`` is not between 8 and 16 inclusive.
    """
    # Imported here so the method does not depend on a module-level import
    # of this name being present.
    from pandas.util import hash_pandas_object

    if not 8 <= b <= 16:
        raise ValueError("b should be between 8 and 16")
    num_bits_discarded = 32 - b
    m = 1 << b

    # Hash every element (ignoring the index) and keep the low 32 bits.
    hashes = hash_pandas_object(obj, index=False)
    if isinstance(hashes, pd.Series):
        hashes = hashes.to_numpy()
    hashes = hashes.astype(np.uint32)

    # The top b bits of each 32-bit hash select the register.
    j = hashes >> num_bits_discarded
    first_bit = self.compute_first_bit(hashes)

    # Let pandas do the per-register max aggregation.
    df = pd.DataFrame({"j": j, "first_bit": first_bit})
    series = df.groupby("j")["first_bit"].max()

    # Return a dense array (0 for untouched registers) so that arrays from
    # different inputs line up and are easy to combine.
    return series.reindex(np.arange(m), fill_value=0).to_numpy().astype(np.uint8)