mirror of
https://github.com/gudvardur/amazon_book_downloader.git
synced 2026-04-28 03:20:19 +00:00
done
This commit is contained in:
parent
4f5f71bdaf
commit
3f1c7e8e27
5 changed files with 179 additions and 38 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -1,7 +1,8 @@
|
|||
/BLOG.MD
|
||||
/__pycache__
|
||||
/archive
|
||||
/downloads
|
||||
/downloads/*
|
||||
/headers.json
|
||||
/renderer.js
|
||||
/ttf_character_mapping.json
|
||||
/ttf_character_mapping.json
|
||||
/decoded_book.epub
|
||||
Binary file not shown.
|
|
@ -23,19 +23,63 @@ def main():
|
|||
asin = sys.argv[1]
|
||||
auto_confirm = '--yes' in sys.argv or '-y' in sys.argv
|
||||
output_base = Path(f'downloads/{asin}')
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Ensure output directory exists
|
||||
try:
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
print(f"[✓] Output directory ready: {output_base}/")
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Cannot create output directory {output_base}: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Load credentials
|
||||
headers_file = Path('headers.json')
|
||||
if not headers_file.exists():
|
||||
print("[✗] headers.json not found!")
|
||||
print("[✗] ERROR: headers.json not found!")
|
||||
print("\nCreate headers.json in the current directory with:")
|
||||
print(' {')
|
||||
print(' "headers": {"x-adp-session-token": "..."},')
|
||||
print(' "cookies": "session-id=...; ..."')
|
||||
print(' }')
|
||||
sys.exit(1)
|
||||
|
||||
with open(headers_file) as f:
|
||||
headers_data = json.load(f)
|
||||
try:
|
||||
with open(headers_file) as f:
|
||||
headers_data = json.load(f)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"[✗] ERROR: Invalid JSON in headers.json: {e}")
|
||||
print("\nEnsure headers.json is valid JSON format")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Cannot read headers.json: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Validate headers structure
|
||||
if not isinstance(headers_data, dict):
|
||||
print("[✗] ERROR: headers.json must contain a JSON object")
|
||||
sys.exit(1)
|
||||
|
||||
cookies = headers_data.get('cookies', '')
|
||||
adp_token = headers_data['headers'].get('x-adp-session-token') if 'headers' in headers_data else None
|
||||
if not cookies:
|
||||
print("[✗] ERROR: No 'cookies' field found in headers.json!")
|
||||
print("\nEnsure headers.json contains:")
|
||||
print(' {')
|
||||
print(' "cookies": "session-id=...; ..."')
|
||||
print(' }')
|
||||
sys.exit(1)
|
||||
|
||||
if not cookies.strip():
|
||||
print("[✗] ERROR: 'cookies' field is empty in headers.json!")
|
||||
sys.exit(1)
|
||||
|
||||
adp_token = None
|
||||
if 'headers' in headers_data:
|
||||
if not isinstance(headers_data['headers'], dict):
|
||||
print("[⚠] WARNING: 'headers' field is not a JSON object, ignoring")
|
||||
else:
|
||||
adp_token = headers_data['headers'].get('x-adp-session-token')
|
||||
if not adp_token:
|
||||
print("[⚠] WARNING: No 'x-adp-session-token' found in headers")
|
||||
|
||||
# Initialize downloader (single session for entire book)
|
||||
print(f"\n{'='*80}")
|
||||
|
|
@ -46,7 +90,15 @@ def main():
|
|||
|
||||
# Get book metadata
|
||||
print("[*] Getting book metadata...")
|
||||
metadata = downloader.start_reading(asin)
|
||||
try:
|
||||
metadata = downloader.start_reading(asin)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Failed to get book metadata: {e}")
|
||||
print("\nPossible issues:")
|
||||
print(" - Invalid ASIN")
|
||||
print(" - Invalid or expired credentials in headers.json")
|
||||
print(" - Network connection problem")
|
||||
sys.exit(1)
|
||||
|
||||
title = metadata.get('deliveredAsin', asin)
|
||||
revision = metadata.get('contentVersion', '')
|
||||
|
|
@ -70,22 +122,48 @@ def main():
|
|||
|
||||
# Download from position 0 to get the complete book including front matter
|
||||
print(f"\n[*] Batch 0: position 0...")
|
||||
first_tar = downloader.render_pages(asin, revision, start_position=0, num_pages=5)
|
||||
first_files = downloader.extract_tar(first_tar, output_base / 'batch_0')
|
||||
try:
|
||||
first_tar = downloader.render_pages(asin, revision, start_position=0, num_pages=5)
|
||||
batch_0_dir = output_base / 'batch_0'
|
||||
batch_0_dir.mkdir(parents=True, exist_ok=True)
|
||||
first_files = downloader.extract_tar(first_tar, batch_0_dir)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Failed to download batch 0: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Get position range from batch 0
|
||||
page_data_file = list((output_base / 'batch_0').glob('page_data_*.json'))[0]
|
||||
with open(page_data_file) as f:
|
||||
first_pages = json.load(f)
|
||||
try:
|
||||
page_data_files = list((output_base / 'batch_0').glob('page_data_*.json'))
|
||||
if not page_data_files:
|
||||
print("[✗] ERROR: No page_data_*.json found in batch 0")
|
||||
sys.exit(1)
|
||||
page_data_file = page_data_files[0]
|
||||
with open(page_data_file) as f:
|
||||
first_pages = json.load(f)
|
||||
|
||||
batch_0_start = first_pages[0]['startPositionId']
|
||||
batch_0_end = first_pages[-1]['endPositionId']
|
||||
print(f"[✓] Batch 0: {batch_0_start} to {batch_0_end} ({len(first_files)} files)")
|
||||
if not first_pages:
|
||||
print("[✗] ERROR: No pages found in batch 0")
|
||||
sys.exit(1)
|
||||
|
||||
batch_0_start = first_pages[0]['startPositionId']
|
||||
batch_0_end = first_pages[-1]['endPositionId']
|
||||
print(f"[✓] Batch 0: {batch_0_start} to {batch_0_end} ({len(first_files)} files)")
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Failed to parse batch 0 data: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Load TOC to estimate book length
|
||||
toc_file = output_base / 'batch_0' / 'toc.json'
|
||||
with open(toc_file) as f:
|
||||
toc = json.load(f)
|
||||
if not toc_file.exists():
|
||||
print("[✗] ERROR: toc.json not found in batch 0")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
with open(toc_file) as f:
|
||||
toc = json.load(f)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Failed to parse toc.json: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
last_toc_pos = max(entry['tocPositionId'] for entry in toc)
|
||||
print(f"[*] Book ends around position {last_toc_pos}")
|
||||
|
|
@ -116,10 +194,19 @@ def main():
|
|||
try:
|
||||
print(f"\n[*] Batch {batch_num}: position {current_pos}...")
|
||||
tar_data = downloader.render_pages(asin, revision, start_position=current_pos, num_pages=5)
|
||||
files = downloader.extract_tar(tar_data, output_base / f'batch_{batch_num}')
|
||||
|
||||
# Ensure batch directory exists
|
||||
batch_dir = output_base / f'batch_{batch_num}'
|
||||
batch_dir.mkdir(parents=True, exist_ok=True)
|
||||
files = downloader.extract_tar(tar_data, batch_dir)
|
||||
|
||||
# Get end position from this batch
|
||||
page_file = list((output_base / f'batch_{batch_num}').glob('page_data_*.json'))[0]
|
||||
page_files = list(batch_dir.glob('page_data_*.json'))
|
||||
if not page_files:
|
||||
print(f"[!] Batch {batch_num}: No page data found, stopping")
|
||||
break
|
||||
|
||||
page_file = page_files[0]
|
||||
with open(page_file) as f:
|
||||
pages = json.load(f)
|
||||
|
||||
|
|
@ -135,6 +222,7 @@ def main():
|
|||
|
||||
except Exception as e:
|
||||
print(f"[✗] Error downloading batch {batch_num}: {e}")
|
||||
print(f"[!] Stopping at batch {batch_num}. Partial download may be available.")
|
||||
break
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
|
|
@ -153,8 +241,12 @@ def main():
|
|||
'estimated_positions': f'{start_pos} to {current_pos}'
|
||||
}
|
||||
|
||||
with open(output_base / 'download_info.json', 'w') as f:
|
||||
json.dump(download_info, f, indent=2)
|
||||
try:
|
||||
with open(output_base / 'download_info.json', 'w') as f:
|
||||
json.dump(download_info, f, indent=2)
|
||||
print(f"[✓] Saved download metadata to {output_base / 'download_info.json'}")
|
||||
except Exception as e:
|
||||
print(f"[⚠] WARNING: Failed to save download metadata: {e}")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -239,44 +239,86 @@ Examples:
|
|||
# Load credentials from headers.json
|
||||
headers_file = Path('headers.json')
|
||||
if not headers_file.exists():
|
||||
print("[✗] headers.json not found!")
|
||||
print("\nCreate headers.json with:")
|
||||
print("[✗] ERROR: headers.json not found!")
|
||||
print("\nCreate headers.json in the current directory with:")
|
||||
print(' {')
|
||||
print(' "headers": {"x-adp-session-token": "..."},')
|
||||
print(' "cookies": "session-id=...; ..."')
|
||||
print(' }')
|
||||
sys.exit(1)
|
||||
|
||||
with open(headers_file) as f:
|
||||
headers_data = json.load(f)
|
||||
try:
|
||||
with open(headers_file) as f:
|
||||
headers_data = json.load(f)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"[✗] ERROR: Invalid JSON in headers.json: {e}")
|
||||
print("\nEnsure headers.json is valid JSON format")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Cannot read headers.json: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Validate headers structure
|
||||
if not isinstance(headers_data, dict):
|
||||
print("[✗] ERROR: headers.json must contain a JSON object")
|
||||
sys.exit(1)
|
||||
|
||||
cookies = headers_data.get('cookies', '')
|
||||
if not cookies:
|
||||
print("[✗] No cookies found in headers.json!")
|
||||
print("[✗] ERROR: No 'cookies' field found in headers.json!")
|
||||
print("\nEnsure headers.json contains:")
|
||||
print(' {')
|
||||
print(' "cookies": "session-id=...; ..."')
|
||||
print(' }')
|
||||
sys.exit(1)
|
||||
|
||||
if not cookies.strip():
|
||||
print("[✗] ERROR: 'cookies' field is empty in headers.json!")
|
||||
sys.exit(1)
|
||||
|
||||
adp_token = None
|
||||
if 'headers' in headers_data:
|
||||
adp_token = headers_data['headers'].get('x-adp-session-token')
|
||||
if not isinstance(headers_data['headers'], dict):
|
||||
print("[⚠] WARNING: 'headers' field is not a JSON object, ignoring")
|
||||
else:
|
||||
adp_token = headers_data['headers'].get('x-adp-session-token')
|
||||
if not adp_token:
|
||||
print("[⚠] WARNING: No 'x-adp-session-token' found in headers")
|
||||
|
||||
# Download
|
||||
downloader = KindleDownloader(cookies, adp_token)
|
||||
|
||||
# Override start position if specified
|
||||
if args.start_position is not None:
|
||||
metadata = downloader.start_reading(args.asin)
|
||||
revision = metadata.get('contentVersion', '')
|
||||
try:
|
||||
metadata = downloader.start_reading(args.asin)
|
||||
revision = metadata.get('contentVersion', '')
|
||||
|
||||
# Download from custom position
|
||||
tar_data = downloader.render_pages(args.asin, revision, start_position=args.start_position, num_pages=args.pages)
|
||||
# Download from custom position
|
||||
tar_data = downloader.render_pages(args.asin, revision, start_position=args.start_position, num_pages=args.pages)
|
||||
|
||||
# Extract
|
||||
output_dir = args.output or f"downloads/{args.asin}"
|
||||
print(f"[*] Extracting to {output_dir}/...")
|
||||
extracted_files = downloader.extract_tar(tar_data, output_dir)
|
||||
print(f"[✓] Extracted {len(extracted_files)} files")
|
||||
# Extract
|
||||
output_dir = args.output or f"downloads/{args.asin}"
|
||||
print(f"[*] Extracting to {output_dir}/...")
|
||||
# Ensure output directory exists
|
||||
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
||||
extracted_files = downloader.extract_tar(tar_data, output_dir)
|
||||
print(f"[✓] Extracted {len(extracted_files)} files")
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"[✗] ERROR: Network request failed: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Download failed: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
downloader.download(args.asin, num_pages=args.pages, output_dir=args.output)
|
||||
try:
|
||||
downloader.download(args.asin, num_pages=args.pages, output_dir=args.output)
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"[✗] ERROR: Network request failed: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"[✗] ERROR: Download failed: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
|||
6
headers.example.json
Normal file
6
headers.example.json
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"headers": {
|
||||
|
||||
},
|
||||
"cookies": ""
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue