2024-12-12 23:32:30 +00:00
|
|
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import time
|
|
|
|
|
2025-01-10 19:04:41 +00:00
|
|
|
import fsspec
|
2024-12-12 23:32:30 +00:00
|
|
|
import requests
|
|
|
|
from huggingface_hub import snapshot_download
|
|
|
|
|
|
|
|
|
|
|
|
def run_command(command):
    """Echo a shell command line, then execute it.

    Args:
        command: Command line handed to the shell as-is.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    banner = f"Running: {command}"
    print(banner)
    # shell=True because callers pass whole shell command lines (pipes, `&&`,
    # redirections) as a single string.
    subprocess.run(command, check=True, shell=True)
|
|
|
|
|
|
|
|
|
|
|
|
def download_dataset(repo_id, local_dir, allow_patterns):
    """Download a Hugging Face dataset snapshot, retrying on read timeouts.

    Args:
        repo_id: Identifier of the dataset repository to download.
        local_dir: Directory the snapshot is written to.
        allow_patterns: Pattern(s) forwarded to ``snapshot_download`` to
            restrict which files are fetched; ``None`` applies no restriction.

    Raises:
        requests.exceptions.ReadTimeout: If the final attempt also times out.
    """
    print(f"Downloading dataset from {repo_id}...")
    max_retries = 5
    retry_delay = 10  # seconds

    attempts_left = max_retries
    while True:
        attempts_left -= 1
        try:
            snapshot_download(
                repo_id,
                repo_type="dataset",
                local_dir=local_dir,
                allow_patterns=allow_patterns,
                resume_download=True,
                max_workers=16,  # Don't hesitate to increase this number to lower the download time
            )
        except requests.exceptions.ReadTimeout:
            # Only read timeouts are retried; any other error propagates.
            if attempts_left <= 0:
                raise
            print(f"Timeout occurred. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            break

    print(f"Dataset downloaded to {local_dir}")
|
|
|
|
|
|
|
|
|
2025-01-10 19:04:41 +00:00
|
|
|
def parquet_to_jsonl(
    dataset, work_dir, src_dir, tgt_dir, ntasks=64, s3_profile: str | None = None
):
    """Convert the parquet files under ``src_dir`` into uncompressed JSONL.

    Args:
        dataset: Dataset name; used as the prefix of every output file name.
        work_dir: Directory under which the ``datatrove`` logging directory
            is created.
        src_dir: Directory searched recursively (``**/*.parquet``) for input.
        tgt_dir: Output location, either a local path or an ``s3://`` URL.
        ntasks: Number of tasks given to the local pipeline executor.
        s3_profile: Optional profile name for the S3 filesystem. Only used
            when ``tgt_dir`` is an ``s3://`` URL.
    """
    # Imported here so datatrove is only required when this function is called.
    from datatrove.executor import LocalPipelineExecutor
    from datatrove.pipeline.readers import ParquetReader
    from datatrove.pipeline.writers import JsonlWriter

    # The scheme check previously read "s3//" (missing colon), which never
    # matches a real S3 URL, so the profile was silently ignored.
    if s3_profile is not None and tgt_dir.startswith("s3://"):
        # Pair the path with a filesystem bound to the requested profile.
        out_spec = (tgt_dir, fsspec.filesystem("s3", profile=s3_profile))
    else:
        out_spec = tgt_dir

    pipeline_exec = LocalPipelineExecutor(
        pipeline=[
            ParquetReader(
                src_dir,
                file_progress=True,
                doc_progress=True,
                glob_pattern="**/*.parquet",
            ),
            JsonlWriter(
                out_spec,
                # "${rank}" is passed through literally as a placeholder for
                # the writer to fill in (presumably per task; see datatrove).
                output_filename=dataset + ".chunk.${rank}.jsonl",
                compression=None,
            ),
        ],
        tasks=ntasks,
        logging_dir=os.path.join(work_dir, "datatrove"),
    )
    pipeline_exec.run()
|
|
|
|
|
|
|
|
|
|
|
|
def setup_terashuf(work_dir):
    """Make sure a terashuf build exists under ``work_dir`` and return its directory.

    If ``<work_dir>/terashuf/terashuf`` is missing, the repository is cloned
    into ``<work_dir>/terashuf`` and built with ``make``.

    Args:
        work_dir: Directory that holds (or will hold) the ``terashuf`` checkout.

    Returns:
        Path of the terashuf checkout directory.
    """
    checkout = os.path.join(work_dir, "terashuf")
    binary = os.path.join(checkout, "terashuf")

    if not os.path.exists(binary):
        print("Setting up terashuf...")
        build_steps = (
            f"git clone https://github.com/alexandres/terashuf {checkout}",
            f"make -C {checkout}",
        )
        for step in build_steps:
            run_command(step)
    else:
        print("terashuf executable already exists. Skipping setup.")

    return checkout
|
|
|
|
|
|
|
|
|
2025-01-10 19:04:41 +00:00
|
|
|
def main(dataset, memory, data_dir, seed=42, nchunks=32, s3_profile: str | None = None):
    """Download a dataset, shuffle it with terashuf and split it into chunks.

    Steps: set up terashuf, download the dataset, convert parquet to JSONL for
    the fineweb datasets, shuffle all records into ``nchunks`` chunk files, then
    move the first lines of every chunk into a single validation file.

    Args:
        dataset: One of ``fineweb_edu``, ``fineweb_edu_10bt``,
            ``dclm_baseline_1.0`` or ``dclm_baseline_1.0_10prct``.
        memory: Value exported as the ``MEMORY`` environment variable before
            terashuf runs.
        data_dir: Root directory; data lives in ``<data_dir>/<dataset>`` and
            output in ``<data_dir>/<dataset>_shuffled``.
        seed: Value exported as the ``SEED`` environment variable.
        nchunks: Number of shuffled chunk files to produce.
        s3_profile: Optional S3 profile forwarded to ``parquet_to_jsonl``.

    Raises:
        KeyError: If ``dataset`` is not one of the supported names.
        subprocess.CalledProcessError: If one of the shell commands fails.
    """
    # Configuration
    repo_id = {
        "fineweb_edu": "HuggingFaceFW/fineweb-edu",
        "fineweb_edu_10bt": "HuggingFaceFW/fineweb-edu",
        "dclm_baseline_1.0": "mlfoundations/dclm-baseline-1.0",
        "dclm_baseline_1.0_10prct": "mlfoundations/dclm-baseline-1.0",
    }[dataset]
    src_dir = f"{data_dir}/{dataset}"
    out_dir = f"{src_dir}_shuffled"
    os.makedirs(out_dir, exist_ok=True)
    # terashuf checkout and datatrove logs are placed inside the dataset directory
    work_dir = src_dir
    prefix = f"{dataset}.chunk."
    orig_extension = {
        "fineweb_edu": ".jsonl",
        "fineweb_edu_10bt": ".jsonl",
        "dclm_baseline_1.0": ".jsonl.zst",
        "dclm_baseline_1.0_10prct": ".jsonl.zst",
    }[dataset]
    cat_command = {
        "fineweb_edu": "cat",
        "fineweb_edu_10bt": "cat",
        "dclm_baseline_1.0": "zstdcat",
        "dclm_baseline_1.0_10prct": "zstdcat",
    }[dataset]
    allow_patterns = {
        "fineweb_edu": None,
        "fineweb_edu_10bt": "sample/10BT/*",
        "dclm_baseline_1.0": "*.jsonl.zst",
        "dclm_baseline_1.0_10prct": "global-shard_01_of_10/*.jsonl.zst",
    }[dataset]
    suffix = ".jsonl"
    k_validation = 10000  # Number of lines to take from each chunk for validation

    # Setup terashuf
    terashuf_dir = setup_terashuf(work_dir)

    # Download dataset
    download_dataset(repo_id, src_dir, allow_patterns)

    if "fineweb" in dataset:
        # fineweb is downloaded as parquet; convert it in place to JSONL.
        # s3_profile was previously accepted by main() but never forwarded.
        parquet_to_jsonl(dataset, work_dir, src_dir, src_dir, s3_profile=s3_profile)

    # Set up environment variables (presumably read by terashuf -- confirm)
    os.environ["MEMORY"] = f"{memory}"
    os.environ["SEED"] = f"{seed}"

    # Run the original shuffling and splitting command
    terashuf_executable = os.path.join(terashuf_dir, "terashuf")
    # NOTE(review): the trailing `trap` only runs after the pipeline has
    # finished, so the shell's exit status is that of `trap` rather than of the
    # pipeline -- confirm this is intended.
    run_command(
        f"ulimit -n 100000 && "
        f"find {src_dir} -type f -name '*{orig_extension}' -print0 | xargs -0 {cat_command} | {terashuf_executable} | "
        f"split -n r/{nchunks} -d --suffix-length 2 --additional-suffix {suffix} - {out_dir}/{prefix}"
        "; trap 'echo \"Caught signal 13, exiting with code 1\"; exit 1' SIGPIPE;"
    )

    # Create validation set and remove lines from chunks
    validation_file = f"{out_dir}/{dataset}.val{suffix}"
    for i in range(nchunks):
        chunk_file = f"{out_dir}/{prefix}{i:02d}{suffix}"
        # Append the first k_validation lines to the validation file, then
        # delete those same lines from the chunk.
        run_command(f"head -n {k_validation} {chunk_file} >> {validation_file}")
        run_command(f"sed -i '1,{k_validation}d' {chunk_file}")

    print("All tasks completed successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and forward them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str)
    # nargs="?" makes the positional optional; without it argparse always
    # requires a value and the declared default could never take effect.
    parser.add_argument("memory", type=float, nargs="?", default=8)
    parser.add_argument("--data_dir", type=str, default="data")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--nchunks", type=int, default=32)
    # Exposes main()'s s3_profile parameter, which had no command-line flag.
    parser.add_argument("--s3_profile", type=str, default=None)

    args = parser.parse_args()

    main(
        args.dataset,
        args.memory,
        args.data_dir,
        args.seed,
        args.nchunks,
        s3_profile=args.s3_profile,
    )
|