Skip to content

Commit a83eee5

Browse files
renewoollerjeffreylovitz
authored and committed
adds ability to set field types explicitly (#9)
* allows any csv quote format
* updates bulk loader to be able to set types explicitly
* updates documentation and help
* sets default field types to none
* adds exception when field types is malformed
* puts type in repr
* updates readme
1 parent 40de2c5 commit a83eee5

File tree

4 files changed

+67
-27
lines changed

4 files changed

+67
-27
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.vscode

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ bulk_insert.py GRAPHNAME [OPTIONS]
2626
| -t | --max-token-count INT | max number of tokens sent in each Redis query (default 1024) |
2727
| -b | --max-buffer-size INT | max batch size (MBs) of each Redis query (default 4096) |
2828
| -c | --max-token-size INT | max size (MBs) of each token sent to Redis (default 500) |
29-
| -q | --quote-minimal | enable smart quoting for items within the CSV |
29+
| -q | --quote | the quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3 |
30+
| -f | --field-types | json to set explicit types for each field, format {<label>:[<col1 type>, <col2 type> ...]} where type can be 0(null),1(bool),2(numeric),3(string) |
3031

3132

3233
The only required arguments are the name to give the newly-created graph (which can appear anywhere) and at least one node CSV file.
@@ -39,6 +40,15 @@ python bulk_insert.py GRAPH_DEMO -n example/Person.csv -n example/Country.csv -r
3940
```
4041
The label (for nodes) or relationship type (for relationships) is derived from the base name of the input CSV file. In this example, we'll construct two sets of nodes, labeled `Person` and `Country`, and two types of relationships - `KNOWS` and `VISITED`.
4142

43+
The default behaviour is to infer the type of each field from its value, which can cause type mismatch problems — for example, if a string property contains values such as 'false', 'true' or numbers. To avoid this, use --field-types to explicitly set the type for each column in the CSV.
44+
E.g., to explicitly set the type to string:
45+
46+
```
47+
python3 bulk_insert.py ROBOTS -f '{"Robots" : [3]}' -q1 -n example2/Robots.csv
48+
```
49+
50+
Notice that when -f isn't used, the robot name "30165" would be inserted as a number rather than a string which causes problems in RedisGraph when searching.
51+
4252
## Input constraints
4353
### Node identifiers
4454
- If both nodes and relations are being created, each node must be associated with a unique identifier.
@@ -55,6 +65,8 @@ The label (for nodes) or relationship type (for relationships) is derived from t
5565
- `numeric`: an unquoted value that can be read as a floating-point or integer type.
5666
- `string`: any field that is either quote-interpolated or cannot be casted to a numeric or boolean type.
5767
- `NULL`: an empty field.
68+
- Default behaviour is to infer the property type, attempting to cast it to null, float, boolean or string in that order.
69+
- If an explicit type is required — for example, if a value is "1234" and must not be inferred as a float — you can use the -f option to specify the type explicitly for each column being imported.
5870

5971
### Label file format:
6072
- Each row must have the same number of fields.
@@ -68,3 +80,4 @@ The label (for nodes) or relationship type (for relationships) is derived from t
6880
- The first two fields of each row are the source and destination node identifiers. The names of these fields in the header do not matter.
6981
- If the file has more than 2 fields, all subsequent fields are relationship properties that adhere to the same rules as node properties.
7082
- Described relationships are always considered to be directed (source->destination).
83+

bulk_insert.py

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66
from timeit import default_timer as timer
77
import redis
88
import click
9+
import json
910

1011
# Global variables
1112
CONFIGS = None # thresholds for batching Redis queries
1213
NODE_DICT = {} # global node dictionary
1314
TOP_NODE_ID = 0 # next ID to assign to a node
1415
QUERY_BUF = None # Buffer for query being constructed
1516

17+
FIELD_TYPES = None
18+
1619
# Custom error class for invalid inputs
1720
class CSVError(Exception):
1821
pass
@@ -155,8 +158,13 @@ def pack_header(self, header):
155158
# Convert a list of properties into a binary string
156159
def pack_props(self, line):
157160
props = []
158-
for field in line[self.prop_offset:]:
159-
props.append(prop_to_binary(field))
161+
for num, field in enumerate(line[self.prop_offset:]):
162+
try :
163+
FIELD_TYPES[self.entity_str][num]
164+
except :
165+
props.append(prop_to_binary(field, None))
166+
else :
167+
props.append(prop_to_binary(field, FIELD_TYPES[self.entity_str][num]))
160168

161169
return b''.join(p for p in props)
162170

@@ -278,31 +286,39 @@ def process_entities(self, expected_col_count):
278286

279287
# Convert a single CSV property field into a binary stream.
280288
# Supported property types are string, numeric, boolean, and NULL.
281-
def prop_to_binary(prop_str):
289+
# type is either Type.NUMERIC, Type.BOOL or Type.STRING, and explicitly sets the value to this type if possible
290+
def prop_to_binary(prop_str, type):
282291
# All format strings start with an unsigned char to represent our Type enum
283292
format_str = "=B"
284293
if not prop_str:
285294
# An empty field indicates a NULL property
286295
return struct.pack(format_str, Type.NULL)
287296

288297
# If field can be cast to a float, allow it
289-
try:
290-
numeric_prop = float(prop_str)
291-
return struct.pack(format_str + "d", Type.NUMERIC, numeric_prop)
292-
except:
293-
pass
294-
295-
# If field is 'false' or 'true', it is a boolean
296-
if prop_str.lower() == 'false':
297-
return struct.pack(format_str + '?', Type.BOOL, False)
298-
elif prop_str.lower() == 'true':
299-
return struct.pack(format_str + '?', Type.BOOL, True)
300-
301-
# If we've reached this point, the property is a string
302-
encoded_str = str.encode(prop_str) # struct.pack requires bytes objects as arguments
303-
# Encoding len+1 adds a null terminator to the string
304-
format_str += "%ds" % (len(encoded_str) + 1)
305-
return struct.pack(format_str, Type.STRING, encoded_str)
298+
299+
if type == None or type == Type.NUMERIC:
300+
try:
301+
numeric_prop = float(prop_str)
302+
return struct.pack(format_str + "d", Type.NUMERIC, numeric_prop)
303+
except:
304+
pass
305+
306+
if type == None or type == Type.BOOL:
307+
# If field is 'false' or 'true', it is a boolean
308+
if prop_str.lower() == 'false':
309+
return struct.pack(format_str + '?', Type.BOOL, False)
310+
elif prop_str.lower() == 'true':
311+
return struct.pack(format_str + '?', Type.BOOL, True)
312+
313+
if type == None or type == Type.STRING:
314+
# If we've reached this point, the property is a string
315+
encoded_str = str.encode(prop_str) # struct.pack requires bytes objects as arguments
316+
# Encoding len+1 adds a null terminator to the string
317+
format_str += "%ds" % (len(encoded_str) + 1)
318+
return struct.pack(format_str, Type.STRING, encoded_str)
319+
320+
## if it hasn't returned by this point, it is trying to set it to a type that it can't adopt
321+
raise Exception("unable to parse [" + prop_str + "] with type ["+repr(type)+"]")
306322

307323
# For each node input file, validate contents and convert to binary format.
308324
# If any buffer limits have been reached, flush all enqueued inserts to Redis.
@@ -336,25 +352,30 @@ def process_entity_csvs(cls, csvs, separator):
336352
@click.option('--max-token-count', '-c', default=1024, help='max number of processed CSVs to send per query (default 1024)')
337353
@click.option('--max-buffer-size', '-b', default=2048, help='max buffer size in megabytes (default 2048)')
338354
@click.option('--max-token-size', '-t', default=500, help='max size of each token in megabytes (default 500, max 512)')
339-
@click.option('--quote-minimal/--no-quote-minimal', '-q/-d', default=False, help='only quote those fields which contain special characters such as delimiter, quotechar or any of the characters in lineterminator')
355+
@click.option('--quote', '-q', default=3, help='the quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3')
356+
@click.option('--field-types', '-f', default=None, help='json to set explicit types for each field, format {<label>:[<col1 type>, <col2 type> ...]} where type can be 0(null),1(bool),2(numeric),3(string)')
340357
@click.option('--skip-invalid-nodes', '-s', default=False, is_flag=True, help='ignore nodes that use previously defined IDs')
341358
@click.option('--skip-invalid-edges', '-e', default=False, is_flag=True, help='ignore invalid edges, print an error message and continue loading (True), or stop loading after an edge loading failure (False)')
342359

343360

344-
def bulk_insert(graph, host, port, password, nodes, relations, separator, max_token_count, max_buffer_size, max_token_size, quote_minimal, skip_invalid_nodes, skip_invalid_edges):
361+
def bulk_insert(graph, host, port, password, nodes, relations, separator, max_token_count, max_buffer_size, max_token_size, quote, field_types, skip_invalid_nodes, skip_invalid_edges):
345362
global CONFIGS
346363
global NODE_DICT
347364
global TOP_NODE_ID
348365
global QUERY_BUF
349366
global QUOTING
367+
global FIELD_TYPES
350368

351369
if sys.version_info[0] < 3:
352370
raise Exception("Python 3 is required for the RedisGraph bulk loader.")
353371

354-
if quote_minimal:
355-
QUOTING=csv.QUOTE_MINIMAL
356-
else:
357-
QUOTING=csv.QUOTE_NONE
372+
if field_types != None:
373+
try :
374+
FIELD_TYPES = json.loads(field_types)
375+
except:
376+
raise Exception("Problem parsing field-types. Use the format {<label>:[<col1 type>, <col2 type> ...]} where type can be 0(null),1(bool),2(numeric),3(string) ")
377+
378+
QUOTING=int(quote)
358379

359380
TOP_NODE_ID = 0 # reset global ID variable (in case we are calling bulk_insert from unit tests)
360381
CONFIGS = Configs(max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges)

example2/Robots.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"name"
2+
"Beebop"
3+
"30165"
4+
"Chappy"
5+
"Wal-e"

0 commit comments

Comments
 (0)