tools.csv_to_snapshot: Limit hashing to uint64

Summary:
This prevents Python's arbitrary-precision integer from growing without bound
while hashing, which was causing major slowdowns.

Reviewers: mferencevic, buda

Reviewed By: mferencevic

Differential Revision: https://phabricator.memgraph.io/D677
This commit is contained in:
Teon Banek 2017-08-18 11:26:49 +02:00
parent 3a365a2808
commit 22ab0e7553

View File

@ -71,11 +71,12 @@ class Hasher:
.format(type(data).__name__)) .format(type(data).__name__))
for byte in data: for byte in data:
self._hash = self._hash * self._PRIME + byte + 1 self._hash = self._hash * self._PRIME + byte + 1
self._hash %= 2**64 # Make hash fit in uint64_t
def digest(self): def digest(self):
'''Return the digest value as an int (which fits in uint64_t) and '''Return the digest value as an int (which fits in uint64_t) and
*not* as bytes. (This is different from hashlib objects.)''' *not* as bytes. (This is different from hashlib objects.)'''
return self._hash % (2**64) return self._hash
class BoltEncoder: class BoltEncoder:
@ -129,8 +130,9 @@ class BoltEncoder:
def write_str(self, value): def write_str(self, value):
self._write(self._STRING32_MARKER) self._write(self._STRING32_MARKER)
self._write(self._UINT32_STRUCT.pack(len(value))) data = value.encode('utf-8')
self._write(value.encode('utf-8')) self._write(self._UINT32_STRUCT.pack(len(data)))
self._write(data)
def write_list(self, values): def write_list(self, values):
self._write(self._LIST32_MARKER) self._write(self._LIST32_MARKER)
@ -264,7 +266,7 @@ def write_node_row(node_row, array_delimiter, encoder):
def convert_nodes(node_filenames, csv_delimiter, array_delimiter, encoder): def convert_nodes(node_filenames, csv_delimiter, array_delimiter, encoder):
node_count = 0 node_count = 0
for node_filename in node_filenames: for node_filename in node_filenames:
with open(node_filename) as node_file: with open(node_filename, newline='', encoding='utf-8') as node_file:
nodes = csv.DictReader(node_file, delimiter=csv_delimiter) nodes = csv.DictReader(node_file, delimiter=csv_delimiter)
for node in nodes: for node in nodes:
write_node_row(node, array_delimiter, encoder) write_node_row(node, array_delimiter, encoder)
@ -307,7 +309,8 @@ def convert_relationships(relationship_filenames, csv_delimiter,
array_delimiter, encoder): array_delimiter, encoder):
relationship_count = 0 relationship_count = 0
for relationship_filename in relationship_filenames: for relationship_filename in relationship_filenames:
with open(relationship_filename) as relationship_file: with open(relationship_filename, newline='', encoding='utf-8') as \
relationship_file:
relationships = csv.DictReader(relationship_file, relationships = csv.DictReader(relationship_file,
delimiter=csv_delimiter) delimiter=csv_delimiter)
for relationship in relationships: for relationship in relationships: