mirror of
https://github.com/facebookresearch/blt.git
synced 2025-09-10 22:34:37 +00:00
Initial commit
This commit is contained in:
commit
bcc039bb75
86 changed files with 12203 additions and 0 deletions
74
bytelatent/preprocess/data_pipeline.py
Normal file
74
bytelatent/preprocess/data_pipeline.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import luigi
|
||||
|
||||
# CHANGEME: Change this to point to your data
|
||||
BASE_DIR = Path("datasets")
|
||||
DATASETS = ["dclm"]
|
||||
TARGET_DIR = Path("entropy_preprocess")
|
||||
|
||||
SHARD_SCRIPT = """split -C 2500m -d {source} {destination}.shard_"""
|
||||
|
||||
|
||||
def list_dataset_shards(dataset: str):
|
||||
dataset_dir = BASE_DIR / dataset
|
||||
return list(dataset_dir.glob("*.chunk.*.jsonl"))
|
||||
|
||||
|
||||
class ChunkFile(luigi.ExternalTask):
|
||||
file = luigi.Parameter()
|
||||
|
||||
def output(self):
|
||||
return luigi.LocalTarget(self.file)
|
||||
|
||||
|
||||
class ShardDatasetChunk(luigi.Task):
|
||||
dataset_name = luigi.Parameter()
|
||||
chunk_file = luigi.Parameter()
|
||||
|
||||
def _chunk_filename(self):
|
||||
return Path(self.chunk_file).name
|
||||
|
||||
def requires(self):
|
||||
return ChunkFile(self.chunk_file)
|
||||
|
||||
def run(self):
|
||||
destination_dir = TARGET_DIR / str(self.dataset_name)
|
||||
destination_dir.mkdir(parents=True, exist_ok=True)
|
||||
destination = destination_dir / self._chunk_filename()
|
||||
subprocess.check_output(
|
||||
SHARD_SCRIPT.format(source=str(self.chunk_file), destination=destination),
|
||||
shell=True,
|
||||
)
|
||||
(
|
||||
Path(TARGET_DIR)
|
||||
/ str(self.dataset_name)
|
||||
/ f"{self._chunk_filename()}.shard.COMPLETE"
|
||||
).touch()
|
||||
|
||||
def output(self):
|
||||
return luigi.LocalTarget(
|
||||
TARGET_DIR
|
||||
/ str(self.dataset_name)
|
||||
/ f"{self._chunk_filename()}.shard.COMPLETE"
|
||||
)
|
||||
|
||||
|
||||
class ShardDataset(luigi.WrapperTask):
|
||||
dataset_name = luigi.Parameter()
|
||||
|
||||
def requires(self):
|
||||
for f in list_dataset_shards(self.dataset_name):
|
||||
yield ShardDatasetChunk(dataset_name=self.dataset_name, chunk_file=str(f))
|
||||
|
||||
|
||||
class ShardAllDatasets(luigi.WrapperTask):
|
||||
def requires(self):
|
||||
for d in DATASETS:
|
||||
yield ShardDataset(dataset_name=d)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
luigi.build([ShardAllDatasets()], local_scheduler=True, workers=128)
|
Loading…
Add table
Add a link
Reference in a new issue