Skip to content

Commit fca74f1

Browse files
Duplicate node tolerance (#7)
Allow skipping of duplicate node IDs
1 parent d88c20e commit fca74f1

File tree

1 file changed

+9
-5
lines changed

1 file changed

+9
-5
lines changed

bulk_insert.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ class Type:
2626

2727
# User-configurable thresholds for when to send queries to Redis
2828
class Configs(object):
29-
def __init__(self, max_token_count, max_buffer_size, max_token_size):
29+
def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes):
3030
# Maximum number of tokens per query
3131
# 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so
3232
# that we can safely ignore tokens that aren't binary strings
@@ -38,6 +38,8 @@ def __init__(self, max_token_count, max_buffer_size, max_token_size):
3838
# 512 megabytes is a hard-coded Redis maximum
3939
self.max_token_size = min(max_token_size * 1000000, 512 * 1000000)
4040

41+
self.skip_invalid_nodes = skip_invalid_nodes
42+
4143
# QueryBuffer is the class that processes input CSVs and emits their binary formats to the Redis client.
4244
class QueryBuffer(object):
4345
def __init__(self, graphname, client):
@@ -192,9 +194,10 @@ def process_entities(self, expected_col_count):
192194
# Add identifier->ID pair to dictionary if we are building relations
193195
if NODE_DICT is not None:
194196
if row[0] in NODE_DICT:
195-
print("Node identifier '%s' was used multiple times - second occurrence at %s:%d"
197+
sys.stderr.write("Node identifier '%s' was used multiple times - second occurrence at %s:%d\n"
196198
% (row[0], self.infile.name, self.reader.line_num))
197-
exit(1)
199+
if CONFIGS.skip_invalid_nodes is False:
200+
exit(1)
198201
NODE_DICT[row[0]] = TOP_NODE_ID
199202
TOP_NODE_ID += 1
200203
row_binary = self.pack_props(row)
@@ -329,8 +332,9 @@ def process_entity_csvs(cls, csvs):
329332
@click.option('--max-buffer-size', '-b', default=2048, help='max buffer size in megabytes (default 2048)')
330333
@click.option('--max-token-size', '-t', default=500, help='max size of each token in megabytes (default 500, max 512)')
331334
@click.option('--quote-minimal/--no-quote-minimal', '-q/-d', default=False, help='only quote those fields which contain special characters such as delimiter, quotechar or any of the characters in lineterminator')
335+
@click.option('--skip-invalid-nodes', '-s', default=False, is_flag=True, help='ignore nodes that use previously defined IDs')
332336

333-
def bulk_insert(graph, host, port, password, nodes, relations, max_token_count, max_buffer_size, max_token_size, quote_minimal):
337+
def bulk_insert(graph, host, port, password, nodes, relations, max_token_count, max_buffer_size, max_token_size, quote_minimal, skip_invalid_nodes):
334338
global CONFIGS
335339
global NODE_DICT
336340
global TOP_NODE_ID
@@ -346,7 +350,7 @@ def bulk_insert(graph, host, port, password, nodes, relations, max_token_count,
346350
QUOTING=csv.QUOTE_NONE
347351

348352
TOP_NODE_ID = 0 # reset global ID variable (in case we are calling bulk_insert from unit tests)
349-
CONFIGS = Configs(max_token_count, max_buffer_size, max_token_size)
353+
CONFIGS = Configs(max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes)
350354

351355
start_time = timer()
352356
# Attempt to connect to Redis server

0 commit comments

Comments (0)