Reduce per-file resources Arrow uses (#77)

Summary:

Test Plan:
Pedro Rodriguez 2025-03-05 15:03:42 -08:00 committed by GitHub
parent 8f2cf8899d
commit 63913e4dba
5 changed files with 68 additions and 3 deletions

@@ -226,7 +226,13 @@ class ArrowFileIterator(StatefulIterator):
                     if (self.row_num - 1) % self.num_workers == self.worker_id:
                         yield out
-        self.batch_iterator = self.dataset.to_batches(batch_size=self.arrow_batch_size)
+        self.batch_iterator = self.dataset.to_batches(
+            batch_size=self.arrow_batch_size,
+            # We have large files in GBs, no need to readahead
+            fragment_readahead=1,
+            # Don't readahead in case batches are huge (e.g., books)
+            batch_readahead=1,
+        )
         for batch in self.batch_iterator:
             batch_columns = batch.to_pydict()
             if self.file_format == "arrow":
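
For context, fragment_readahead and batch_readahead are standard scanner options on pyarrow's Dataset.to_batches(); lowering them limits how much data the scanner prefetches into memory while iterating. The minimal sketch below illustrates the same knobs outside this repo; the file path, format, and batch size are illustrative placeholders, not values from the commit.

    # Standalone sketch of the pyarrow options tuned in this commit (illustrative values).
    import pyarrow.dataset as ds

    # Hypothetical single-file dataset; the repo iterates large .arrow shards.
    dataset = ds.dataset("data/shard_00.arrow", format="arrow")

    # By default the scanner prefetches several fragments and many batches ahead,
    # which can keep multiple large batches resident in memory at once.
    # Setting both readahead values to 1 minimizes that buffering.
    batch_iterator = dataset.to_batches(
        batch_size=100,
        fragment_readahead=1,  # how many fragments (files) to prefetch
        batch_readahead=1,     # how many record batches to prefetch
    )

    for batch in batch_iterator:
        rows = batch.to_pydict()
        # process rows here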