Skip to content

Commit

Permalink
I think I have error handling better now
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 14, 2024
1 parent 8217e49 commit 2c7686f
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 5 deletions.
18 changes: 14 additions & 4 deletions pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,12 @@ async def process_pdf(args, session: aiohttp.ClientSession, worker_id: int, pdf_

# Collect the results from the entire task group, assuming no exceptions
page_results = [task.result() for task in page_tasks]
return build_dolma_document(pdf_s3_path, page_results)
except Exception as e:
logger.exception(f"Exception in process_pdf for {pdf_s3_path}: {e}")
raise

return build_dolma_document(pdf_s3_path, page_results)
# A dolma doc cannot be built if even one page failed, so give up on this PDF here.
# Return None instead of re-raising: a propagated exception would cancel every other task in the worker's asyncio.TaskGroup.
return None


def build_dolma_document(pdf_s3_path, page_results):
Expand Down Expand Up @@ -435,7 +436,16 @@ async def worker(args, queue, semaphore, worker_id):
async with asyncio.TaskGroup() as tg:
dolma_tasks = [tg.create_task(process_pdf(args, session, worker_id, pdf)) for pdf in pdfs]

dolma_docs = [task.result() for task in dolma_tasks if task.result() is not None]
dolma_docs = []
for task in dolma_tasks:
try:
result = task.result()
except:
# some dolma doc creations may have failed
pass

if result is not None:
dolma_docs.append(result)

# Write the Dolma documents to a local temporary file in JSONL format
with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tf:
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "12"
_PATCH = "13"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down

0 comments on commit 2c7686f

Please sign in to comment.