Skip to content

Commit 806a01c

Browse files
Merge pull request openai#448 from domenicomanna/parallel-processor-add-metadata
Add ability to store row-level metadata
2 parents a148d0b + a4c088d commit 806a01c

File tree

2 files changed: 17 additions and 5 deletions

examples/api_request_parallel_processor.py

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,8 @@
3131
Inputs:
3232
- requests_filepath : str
3333
- path to the file containing the requests to be processed
34-
- file should be a jsonl file, where each line is a json object with API parameters
35-
- e.g., {"model": "text-embedding-ada-002", "input": "embed me"}
34+
- file should be a jsonl file, where each line is a json object with API parameters and an optional metadata field
35+
- e.g., {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}}
3636
- as with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically)
3737
- an example file is provided at examples/data/example_requests_to_parallel_process.jsonl
3838
- the code to generate the example file is appended to the bottom of this script
@@ -164,6 +164,7 @@ async def process_api_requests_from_file(
164164
request_json=request_json,
165165
token_consumption=num_tokens_consumed_from_request(request_json, api_endpoint, token_encoding_name),
166166
attempts_left=max_attempts,
167+
metadata=request_json.pop("metadata", None)
167168
)
168169
status_tracker.num_tasks_started += 1
169170
status_tracker.num_tasks_in_progress += 1
@@ -258,6 +259,7 @@ class APIRequest:
258259
request_json: dict
259260
token_consumption: int
260261
attempts_left: int
262+
metadata: dict
261263
result: list = field(default_factory=list)
262264

263265
async def call_api(
@@ -298,11 +300,21 @@ async def call_api(
298300
retry_queue.put_nowait(self)
299301
else:
300302
logging.error(f"Request {self.request_json} failed after all attempts. Saving errors: {self.result}")
301-
append_to_jsonl([self.request_json, [str(e) for e in self.result]], save_filepath)
303+
data = (
304+
[self.request_json, [str(e) for e in self.result], self.metadata]
305+
if self.metadata
306+
else [self.request_json, [str(e) for e in self.result]]
307+
)
308+
append_to_jsonl(data, save_filepath)
302309
status_tracker.num_tasks_in_progress -= 1
303310
status_tracker.num_tasks_failed += 1
304311
else:
305-
append_to_jsonl([self.request_json, response], save_filepath)
312+
data = (
313+
[self.request_json, response, self.metadata]
314+
if self.metadata
315+
else [self.request_json, response]
316+
)
317+
append_to_jsonl(data, save_filepath)
306318
status_tracker.num_tasks_in_progress -= 1
307319
status_tracker.num_tasks_succeeded += 1
308320
logging.debug(f"Request {self.task_id} saved to {save_filepath}")

examples/data/example_requests_to_parallel_process.jsonl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
{"model": "text-embedding-ada-002", "input": "0\n"}
1+
{"model": "text-embedding-ada-002", "input": "0\n", "metadata": {"row_id": 1}}
22
{"model": "text-embedding-ada-002", "input": "1\n"}
33
{"model": "text-embedding-ada-002", "input": "2\n"}
44
{"model": "text-embedding-ada-002", "input": "3\n"}

0 commit comments

Comments
 (0)