amazon_book_downloader/download_full_book.py
PixelMelt 3f1c7e8e27 done
2025-10-15 15:37:45 -04:00

252 lines
8.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Download complete book by downloading 5 pages at a time in a single session.
This ensures all pages share the same font/glyph encoding.
Strategy:
1. Download from position 0 (so front matter and the TOC are included) - 5 pages at a time
2. Keep downloading until we reach the end
3. All downloads in ONE session so fonts match
4. Use TOC from first download to build glyph mapping
5. Decode all pages using that single mapping
"""
import json
import sys
from pathlib import Path
from downloader import KindleDownloader
# Number of pages requested from the renderer per batch.
_PAGES_PER_BATCH = 5

# Command-line flags that skip the interactive confirmation prompt.
_CONFIRM_FLAGS = ('--yes', '-y')


def _fail(message, *hints):
    """Print an error line followed by optional hint lines, then exit with status 1."""
    print(f"[✗] ERROR: {message}")
    for hint in hints:
        print(hint)
    sys.exit(1)


def _load_credentials(headers_file):
    """Read the cookie string and optional ADP session token from *headers_file*.

    Returns:
        A ``(cookies, adp_token)`` tuple; ``adp_token`` is None when the file
        has no usable ``headers["x-adp-session-token"]`` entry.

    Exits the process with status 1 (after printing a diagnostic) when the
    file is missing, unreadable, not a JSON object, or lacks a non-empty
    string ``cookies`` field.
    """
    if not headers_file.exists():
        _fail(
            "headers.json not found!",
            "\nCreate headers.json in the current directory with:",
            ' {',
            ' "headers": {"x-adp-session-token": "..."},',
            ' "cookies": "session-id=...; ..."',
            ' }',
        )

    try:
        with open(headers_file, encoding='utf-8') as f:
            headers_data = json.load(f)
    except json.JSONDecodeError as e:
        _fail(
            f"Invalid JSON in headers.json: {e}",
            "\nEnsure headers.json is valid JSON format",
        )
    except (OSError, ValueError) as e:
        _fail(f"Cannot read headers.json: {e}")

    if not isinstance(headers_data, dict):
        _fail("headers.json must contain a JSON object")

    cookies = headers_data.get('cookies', '')
    if not cookies:
        _fail(
            "No 'cookies' field found in headers.json!",
            "\nEnsure headers.json contains:",
            ' {',
            ' "cookies": "session-id=...; ..."',
            ' }',
        )
    # A non-string value (e.g. a JSON list) would otherwise crash on .strip().
    if not isinstance(cookies, str):
        _fail("'cookies' field in headers.json must be a string")
    if not cookies.strip():
        _fail("'cookies' field is empty in headers.json!")

    adp_token = None
    if 'headers' in headers_data:
        if not isinstance(headers_data['headers'], dict):
            print("[⚠] WARNING: 'headers' field is not a JSON object, ignoring")
        else:
            adp_token = headers_data['headers'].get('x-adp-session-token')
            if not adp_token:
                print("[⚠] WARNING: No 'x-adp-session-token' found in headers")
    return cookies, adp_token


def _read_batch_pages(batch_dir):
    """Load the page list from the first ``page_data_*.json`` in *batch_dir*.

    Returns None when no such file exists, otherwise the parsed JSON content.
    Matches are sorted so the chosen file is deterministic if several exist.
    """
    page_files = sorted(batch_dir.glob('page_data_*.json'))
    if not page_files:
        return None
    with open(page_files[0], encoding='utf-8') as f:
        return json.load(f)


def main():
    """Download a whole book in consecutive batches within one downloader session.

    Command line: ``download_full_book.py <ASIN> [--yes|-y]``.

    Reads credentials from ``headers.json`` in the current directory, fetches
    batch 0 from position 0, uses ``toc.json`` from that batch to find the
    last TOC position, asks for confirmation (unless ``--yes``/``-y`` is
    given), then keeps requesting batches until the last TOC position is
    reached, a batch yields no pages, a batch makes no forward progress, or
    an error occurs. Output goes to ``downloads/<ASIN>/batch_<n>/`` plus
    ``karamel_token.json`` and ``download_info.json``.

    Exits with status 1 on setup errors and 0 when the user declines.
    """
    # Separate flags from positional arguments so a lone "--yes" is never
    # mistaken for the ASIN, and the flag may appear before or after it.
    cli_args = sys.argv[1:]
    positional = [arg for arg in cli_args if arg not in _CONFIRM_FLAGS]
    if not positional:
        print("Usage: python3 download_full_book.py <ASIN> [--yes]")
        sys.exit(1)
    asin = positional[0]
    auto_confirm = any(arg in _CONFIRM_FLAGS for arg in cli_args)

    output_base = Path(f'downloads/{asin}')
    try:
        output_base.mkdir(parents=True, exist_ok=True)
        print(f"[✓] Output directory ready: {output_base}/")
    except OSError as e:
        _fail(f"Cannot create output directory {output_base}: {e}")

    cookies, adp_token = _load_credentials(Path('headers.json'))

    # One downloader instance is used for every request of this run.
    print(f"\n{'='*80}")
    print(f"DOWNLOADING COMPLETE BOOK: {asin}")
    print(f"{'='*80}\n")
    downloader = KindleDownloader(cookies, adp_token)

    print("[*] Getting book metadata...")
    try:
        metadata = downloader.start_reading(asin)
    except Exception as e:
        _fail(
            f"Failed to get book metadata: {e}",
            "\nPossible issues:",
            " - Invalid ASIN",
            " - Invalid or expired credentials in headers.json",
            " - Network connection problem",
        )

    title = metadata.get('deliveredAsin', asin)
    revision = metadata.get('contentVersion', '')
    start_pos = metadata.get('srl', 0)
    print(f"[*] Title: {title}")
    print(f"[*] Revision: {revision}")
    print(f"[*] Default start position (srl): {start_pos}")
    print("[*] Downloading from position 0 to include front matter (TOC, cover, etc)")

    # Save karamelToken for image decryption. A failure here is reported as a
    # warning (like the download_info.json save below) instead of a traceback.
    if 'karamelToken' in metadata:
        token_file = output_base / 'karamel_token.json'
        try:
            karamel_token = {
                'token': metadata['karamelToken']['token'],
                'expiresAt': metadata['karamelToken']['expiresAt'],
            }
            with open(token_file, 'w', encoding='utf-8') as f:
                json.dump(karamel_token, f, indent=2)
            print(f"[✓] Saved karamelToken to {token_file}")
        except (KeyError, TypeError, OSError) as e:
            print(f"[⚠] WARNING: Failed to save karamelToken: {e}")

    # Batch 0 starts at position 0 rather than at the srl reported above.
    print("\n[*] Batch 0: position 0...")
    batch_0_dir = output_base / 'batch_0'
    try:
        first_tar = downloader.render_pages(
            asin, revision, start_position=0, num_pages=_PAGES_PER_BATCH
        )
        batch_0_dir.mkdir(parents=True, exist_ok=True)
        first_files = downloader.extract_tar(first_tar, batch_0_dir)
    except Exception as e:
        _fail(f"Failed to download batch 0: {e}")

    # Position range covered by batch 0.
    try:
        first_pages = _read_batch_pages(batch_0_dir)
        if first_pages is None:
            _fail("No page_data_*.json found in batch 0")
        if not first_pages:
            _fail("No pages found in batch 0")
        batch_0_start = first_pages[0]['startPositionId']
        batch_0_end = first_pages[-1]['endPositionId']
        # Computed here so non-numeric positions are reported as a parse error.
        positions_per_batch = batch_0_end - batch_0_start
        print(f"[✓] Batch 0: {batch_0_start} to {batch_0_end} ({len(first_files)} files)")
    except Exception as e:
        _fail(f"Failed to parse batch 0 data: {e}")

    # The largest TOC position is used as the approximate end of the book.
    toc_file = batch_0_dir / 'toc.json'
    if not toc_file.exists():
        _fail("toc.json not found in batch 0")
    try:
        with open(toc_file, encoding='utf-8') as f:
            toc = json.load(f)
    except Exception as e:
        _fail(f"Failed to parse toc.json: {e}")
    try:
        last_toc_pos = max(entry['tocPositionId'] for entry in toc)
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: empty TOC; KeyError/TypeError: unexpected entry shape.
        _fail(f"toc.json has no usable 'tocPositionId' entries: {e}")
    print(f"[*] Book ends around position {last_toc_pos}")

    # Estimate the batch count from the span of batch 0. The span is measured
    # from where the download actually began, and a zero-width batch must not
    # trigger a division by zero.
    if positions_per_batch > 0:
        remaining_span = max(last_toc_pos - batch_0_start, 0)
        estimated_batches = remaining_span // positions_per_batch + 1
        print(
            f"[*] Estimated {estimated_batches} batches needed "
            f"(~{positions_per_batch} positions per {_PAGES_PER_BATCH} pages)"
        )
        print("\n[!] WARNING: This will download the entire book!")
        print(f"[!] Estimated total: {estimated_batches * _PAGES_PER_BATCH} pages")
    else:
        print("[⚠] WARNING: Cannot estimate batch count (batch 0 spans no positions)")
        print("\n[!] WARNING: This will download the entire book!")

    if auto_confirm:
        print("[*] Auto-confirmed with --yes flag")
    else:
        try:
            response = input("\nContinue? [y/N]: ")
        except EOFError:
            # No interactive stdin: treat as the default answer "no".
            response = ''
        if response.strip().lower() != 'y':
            print("[*] Aborted")
            sys.exit(0)

    # Remaining batches, each starting right after the previous batch's end.
    current_pos = batch_0_end + 1
    batch_num = 1
    failed = False
    print("\n[*] Downloading remaining batches...")
    while current_pos < last_toc_pos:
        try:
            print(f"\n[*] Batch {batch_num}: position {current_pos}...")
            tar_data = downloader.render_pages(
                asin, revision, start_position=current_pos, num_pages=_PAGES_PER_BATCH
            )
            batch_dir = output_base / f'batch_{batch_num}'
            batch_dir.mkdir(parents=True, exist_ok=True)
            downloader.extract_tar(tar_data, batch_dir)

            pages = _read_batch_pages(batch_dir)
            if pages is None:
                print(f"[!] Batch {batch_num}: No page data found, stopping")
                break
            if not pages:
                print(f"[!] Batch {batch_num}: No pages returned, stopping")
                break
            batch_end = pages[-1]['endPositionId']
            print(f"[✓] Batch {batch_num}: {pages[0]['startPositionId']} to {batch_end}")
            next_pos = batch_end + 1
        except Exception as e:
            print(f"[✗] Error downloading batch {batch_num}: {e}")
            print(f"[!] Stopping at batch {batch_num}. Partial download may be available.")
            failed = True
            break

        batch_num += 1
        # Without forward progress the same range would be requested forever.
        if next_pos <= current_pos:
            print(f"[!] No forward progress past position {current_pos}, stopping")
            break
        current_pos = next_pos

    print(f"\n{'='*80}")
    if failed:
        print("[!] DOWNLOAD INCOMPLETE (stopped after an error)")
    else:
        print("[✓] DOWNLOAD COMPLETE")
    print(f"[✓] Downloaded {batch_num} batches")
    print(f"[✓] Saved to: {output_base}/")
    print(f"{'='*80}\n")

    # Save download metadata. 'start_position' records the srl reported by the
    # book metadata; the batches themselves were requested from position 0.
    download_info = {
        'asin': asin,
        'revision': revision,
        'start_position': start_pos,
        'total_batches': batch_num,
        'pages_per_batch': _PAGES_PER_BATCH,
        'estimated_positions': f'{start_pos} to {current_pos}',
    }
    info_file = output_base / 'download_info.json'
    try:
        with open(info_file, 'w', encoding='utf-8') as f:
            json.dump(download_info, f, indent=2)
        print(f"[✓] Saved download metadata to {info_file}")
    except Exception as e:
        print(f"[⚠] WARNING: Failed to save download metadata: {e}")
# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()