1
0
mirror of https://github.com/pruzko/hakuin synced 2024-11-08 13:59:15 +01:00

float extraction (text-based binary search)

This commit is contained in:
Jakub Pruzinec 2023-12-07 17:17:06 +01:00
parent 34e11cf2ee
commit 5983e5fb42
3 changed files with 34 additions and 10 deletions

@ -2,8 +2,6 @@ import hakuin
import hakuin.search_algorithms as alg
import hakuin.collectors as coll
from hakuin.utils import CHARSET_DIGITS
class Extractor:
@ -200,12 +198,10 @@ class Extractor:
list: list of floats in the column
'''
ctx = coll.Context(table=table, column=column)
res = coll.BinaryTextCollector(
return coll.FloatCollector(
requester=self.requester,
dbms=self.dbms,
charset=CHARSET_DIGITS,
).run(ctx)
return [float(v) if v is not None else None for v in res]
def extract_column_bytes(self, table, column):

@ -3,7 +3,7 @@ from abc import ABCMeta, abstractmethod
from collections import Counter
import hakuin
from hakuin.utils import tokenize, EOS, ASCII_MAX, UNICODE_MAX, BYTE_MAX
from hakuin.utils import tokenize, EOS, ASCII_MAX, UNICODE_MAX, BYTE_MAX, CHARSET_DIGITS
from hakuin.utils.huffman import make_tree
from hakuin.search_algorithms import BinarySearch, TreeSearch, NumericBinarySearch
@ -116,6 +116,7 @@ class Collector(metaclass=ABCMeta):
return self.requester.request(ctx, query)
class IntCollector(Collector):
'''Collector for integer columns'''
def collect_row(self, ctx):
@ -129,12 +130,35 @@ class IntCollector(Collector):
).run(ctx)
class FloatCollector(Collector):
    '''Collector for float columns'''
    def collect_row(self, ctx):
        '''Collects a single float by inferring its textual form one character at a time.

        Params:
            ctx (Context): collection state; ctx.s is reset to an empty string
                           and accumulates the characters inferred so far

        Returns:
            float: the value parsed from the collected characters
        '''
        ctx.s = ''
        while True:
            c = self.collect_one(ctx)
            if c == EOS:
                # The textual form is complete. The conversion must happen here:
                # the loop only exits through this return, so a conversion placed
                # after the loop would never run and callers would get a string.
                return float(ctx.s)
            ctx.s += c


    def collect_one(self, ctx):
        '''Infers the next character of the float's textual form.

        Params:
            ctx (Context): collection state passed through to the search

        Returns:
            the result of BinarySearch over CHARSET_DIGITS; collect_row treats EOS as the end marker
        '''
        return BinarySearch(
            requester=self.requester,
            query_cb=self.dbms.q_float_char_in_set,
            values=CHARSET_DIGITS,
        ).run(ctx)
class BytesCollector(Collector):
'''Collector for bytes columns'''
def collect_row(self, ctx):
ctx.s = b''
while True:
b = self.collect_byte(ctx)
b = self.collect_one(ctx)
if b == EOS:
return ctx.s
ctx.s += b
@ -142,7 +166,7 @@ class BytesCollector(Collector):
return ctx.s
def collect_byte(self, ctx):
def collect_one(self, ctx):
res = NumericBinarySearch(
requester=self.requester,
query_cb=self.dbms.q_byte_lt,
@ -154,6 +178,7 @@ class BytesCollector(Collector):
return EOS if res == BYTE_MAX + 1 else res.to_bytes(1, 'big')
class TextCollector(Collector):
'''Collector for text columns.'''
def __init__(self, requester, dbms, charset=None):

@ -114,8 +114,8 @@ class DBMS(metaclass=ABCMeta):
query = self.jj.get_template('char_in_set.jinja').render(ctx=ctx, values=values, has_eos=has_eos)
return self.normalize(query)
def q_char_lt(self, ctx, n, has_eos):
query = self.jj.get_template('char_lt.jinja').render(ctx=ctx, n=n, has_eos=has_eos)
def q_char_lt(self, ctx, n):
    '''Renders the 'char_lt.jinja' template with ctx and n and returns the normalized query.'''
    template = self.jj.get_template('char_lt.jinja')
    rendered = template.render(ctx=ctx, n=n)
    return self.normalize(rendered)
def q_string_in_set(self, ctx, values):
@ -126,6 +126,9 @@ class DBMS(metaclass=ABCMeta):
query = self.jj.get_template('int_lt.jinja').render(ctx=ctx, n=n)
return self.normalize(query)
def q_float_char_in_set(self, ctx, values):
    '''Builds the float-character membership query by delegating to q_char_in_set with the same ctx and values.'''
    query = self.q_char_in_set(ctx, values)
    return query
def q_byte_lt(self, ctx, n):
    '''Renders the 'byte_lt.jinja' template with ctx and n and returns the normalized query.'''
    template = self.jj.get_template('byte_lt.jinja')
    rendered = template.render(ctx=ctx, n=n)
    return self.normalize(rendered)