mirror of
https://github.com/gudvardur/amazon_book_downloader.git
synced 2026-04-28 03:20:19 +00:00
401 lines
14 KiB
Python
401 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Create an EPUB file from the decoded Amazon book data with proper formatting.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
import sys
|
|
from ebooklib import epub
|
|
|
|
|
|
import html


# CSS for the generated EPUB.  Values mirror the Kindle rendering parameters
# used by downloader.py (fontFamily='Bookerly', fontSize='8.91', lineHeight='1.4').
_STYLE = '''
body {
    font-family: Bookerly, Georgia, serif;
    font-size: 8pt;
    line-height: 1.0;
    margin: 0 auto;
    padding: 0;
    max-width: 1000px;
    background-color: #ffffff;
    color: #000000;
}
p {
    margin: 0;
    padding: 0;
    line-height: 1.0;
}
p.center {
    text-align: center;
}
p.right {
    text-align: right;
}
p.indent {
    text-indent: 2em;
}
p.break {
    margin-top: 0.8em;
}
.italic { font-style: italic; }
.bold { font-weight: bold; }
.link {
    color: #0066cc;
    text-decoration: underline;
}
h1 {
    font-size: 1.8em;
    margin: 1em 0 0.5em 0;
    font-weight: bold;
}
h2 {
    font-size: 1.4em;
    margin: 0.8em 0 0.4em 0;
    font-weight: bold;
}
'''


def _iter_glyph_runs(batch_dirs):
    """Yield every glyph-bearing run from every page file, in reading order.

    Walks each batch directory's ``page_data_*.json`` files in sorted order;
    runs without a ``glyphs`` key are skipped.  Shared by both the line-info
    pass and the TOC position-mapping pass so the traversal order is
    guaranteed to be identical.
    """
    for batch_dir in batch_dirs:
        for page_file in sorted(batch_dir.glob('page_data_*.json')):
            with open(page_file) as f:
                pages = json.load(f)
            for page in pages:
                for run in page.get('children', []):
                    if 'glyphs' in run:
                        yield run


def _classify_alignment(left, right, page_width):
    """Classify a text run as 'left', 'center', 'right' or 'indent'.

    All thresholds are fractions of the page width so the heuristic is
    independent of the rendering resolution.
    """
    center = (left + right) / 2
    page_center = page_width / 2
    text_width = right - left

    center_tolerance = page_width * 0.05      # 5% of page width
    edge_tolerance = page_width * 0.05        # 5% tolerance at the edges
    min_side_margin = page_width * 0.1        # 10% margin each side for center
    min_left_margin_right = page_width * 0.2  # 20% left margin for right-align
    min_indent = page_width * 0.05            # 5% minimum indent
    max_indent = page_width * 0.15            # 15% max for a paragraph indent
    min_text_width = page_width * 0.3         # indented text spans >= 30%

    # Centered: text centre near the page centre AND margins on both sides.
    if (abs(center - page_center) < center_tolerance
            and left > min_side_margin
            and (page_width - right) > min_side_margin):
        return 'center'
    # Right-aligned: close to the right edge with a significant left margin.
    if abs(right - page_width) < edge_tolerance and left > min_left_margin_right:
        return 'right'
    # Indented paragraph: moderate left margin with substantial text width.
    if min_indent < left < max_indent and text_width > min_text_width:
        return 'indent'
    return 'left'


def _build_line_info(batch_dirs, page_width):
    """Build per-glyph formatting info and line-break markers.

    Returns ``(line_info, glyph_count)`` where ``line_info`` maps each
    glyph's index in the all_glyphs stream to a dict of formatting
    attributes; the glyph that ends a visual line additionally carries
    ``'line_break': True``.
    """
    line_info = {}
    current_index = 0
    prev_y = None  # top Y of the previous run, used to detect line breaks

    for run in _iter_glyph_runs(batch_dirs):
        num_glyphs = len(run['glyphs'])

        # Apply the run's affine-transform translation to its bounding rect.
        rect = run.get('rect', {})
        transform = run.get('transform', [1, 0, 0, 1, 0, 0])
        tx = transform[4] if len(transform) >= 6 else 0
        ty = transform[5] if len(transform) >= 6 else 0

        left = rect.get('left', 0) + tx
        right = rect.get('right', 0) + tx
        top = rect.get('top', 0) + ty

        info = {
            'font_style': run.get('fontStyle', 'normal'),
            'font_weight': run.get('fontWeight', 400),
            'font_size': run.get('fontSize', 8.91),  # default from downloader.py
            'has_link': 'link' in run,
            'left': left,
            'alignment': _classify_alignment(left, right, page_width),
        }

        # A significant Y change means this run starts a new visual line.
        is_new_line = prev_y is None or abs(top - prev_y) > 5

        # Each glyph gets its own copy so a line-break flag set on one
        # index does not leak onto its neighbours.
        for i in range(num_glyphs):
            line_info[current_index + i] = dict(info)

        # Only mark a break if this run is on a NEW line, and put it at
        # the END of the previous run.
        if is_new_line and current_index > 0:
            line_info[current_index - 1]['line_break'] = True

        current_index += num_glyphs
        prev_y = top

    return line_info, current_index


def _map_positions(batch_dirs):
    """Map each run's startPositionId to the global index of its first glyph."""
    position_to_glyph_idx = {}
    glyph_idx = 0
    for run in _iter_glyph_runs(batch_dirs):
        start_pos_id = run.get('startPositionId')
        if start_pos_id is not None:
            position_to_glyph_idx[start_pos_id] = glyph_idx
        glyph_idx += len(run['glyphs'])
    return position_to_glyph_idx


def _render_chapters(all_glyphs, char_mapping, line_info, toc_chapters):
    """Decode the glyph stream into per-chapter XHTML fragment lists.

    Returns a dict mapping chapter number -> list of HTML fragments.
    Inline spans are opened/closed as formatting classes change, and
    paragraph breaks are emitted at recorded line-break positions.
    Content before the first TOC entry is skipped.
    """
    bullets = ('•', '◦', '●')

    # First-wins map of glyph index -> chapter number (matches the original
    # "break on first matching TOC entry" behaviour).
    chapter_starts = {}
    for toc_ch in toc_chapters:
        chapter_starts.setdefault(toc_ch['glyph_idx'], toc_ch['chapter_num'])

    chapter_contents = {}          # chapter_num -> list of HTML fragments
    current_chapter = -1           # -1 means "before the first chapter"
    open_classes = []              # classes of the currently open <span>
    consecutive_line_breaks = 0    # >= 2 in a row becomes a p.break

    for idx, glyph_id in enumerate(all_glyphs):
        if idx in chapter_starts:
            current_chapter = chapter_starts[idx]
            chapter_contents.setdefault(current_chapter, ['<p>'])

        if current_chapter == -1:
            continue  # front matter before the first TOC entry

        # Decode this glyph; unknown glyph ids are kept visible as "[id]".
        mapped = char_mapping.get(str(glyph_id))
        char = mapped["character"] if mapped is not None else f"[{glyph_id}]"

        info = line_info.get(idx, {})
        out = chapter_contents[current_chapter]

        # Inline classes for this glyph.
        classes = []
        if info.get('font_style', 'normal') == 'italic':
            classes.append('italic')
        if info.get('font_weight', 400) >= 700:
            classes.append('bold')
        if info.get('has_link', False):
            classes.append('link')

        # Inline font-size only when it deviates more than 1pt from the
        # 8.91pt base; expressed relative to the 8pt CSS body size.
        font_size = info.get('font_size', 8.91)
        font_size_style = ''
        if abs(font_size - 8.91) > 1.0:
            font_size_style = f'font-size: {font_size / 8.0:.2f}em'

        # On a class change, close the previous span and open a new one.
        if classes != open_classes:
            if open_classes:
                out.append('</span>')
            if classes:
                class_attr = f' class="{" ".join(classes)}"'
                style_attr = f' style="{font_size_style}"' if font_size_style else ''
                out.append(f'<span{class_attr}{style_attr}>')
            elif font_size_style:
                # NOTE(review): this font-size-only span is not tracked in
                # open_classes and is never explicitly closed — preserved
                # from the original logic; fixing it would change output.
                out.append(f'<span style="{font_size_style}">')
            open_classes = classes

        out.append(html.escape(char))

        if info.get('line_break', False):
            # Keep bullets attached to their text: look at this glyph and
            # up to four preceding glyphs (skipping spaces) for a bullet.
            is_current_bullet = char in bullets
            prev_is_bullet = False
            for look_back in range(1, min(5, idx + 1)):
                prev_char = char_mapping.get(
                    str(all_glyphs[idx - look_back]), {}).get("character", "")
                if prev_char in bullets:
                    prev_is_bullet = True
                    break
                elif prev_char != ' ':
                    break

            if open_classes:
                out.append('</span>')
                open_classes = []

            # Suppress line breaks around bullets so they stay with their text.
            if is_current_bullet or prev_is_bullet:
                consecutive_line_breaks = 0
            else:
                consecutive_line_breaks += 1

            # The NEXT line's alignment decides the new paragraph's class.
            next_alignment = 'left'
            if idx + 1 < len(all_glyphs):
                next_alignment = line_info.get(idx + 1, {}).get('alignment', 'left')

            p_classes = []
            if consecutive_line_breaks >= 2:
                p_classes.append('break')
                consecutive_line_breaks = 0
            if next_alignment in ('center', 'right', 'indent'):
                p_classes.append(next_alignment)

            class_str = f' class="{" ".join(p_classes)}"' if p_classes else ''
            out.append(f'</p>\n<p{class_str}>')
        else:
            consecutive_line_breaks = 0

    return chapter_contents


def main():
    """Assemble ``decoded_book.epub`` from a downloaded book directory.

    Usage: ``python3 create_epub_new.py <book_dir>``.  Requires
    ``ttf_character_mapping.json`` (from match_ttf_to_glyphs.py) in the
    working directory.  Exits with status 1 on missing arguments and
    returns early if required input files are absent.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 create_epub_new.py <book_dir>")
        sys.exit(1)

    book_dir = Path(sys.argv[1])

    # Glyph-id -> character mapping produced by match_ttf_to_glyphs.py.
    mapping_file = Path("ttf_character_mapping.json")
    if not mapping_file.exists():
        print(f"Mapping file not found: {mapping_file}")
        print("Run match_ttf_to_glyphs.py first!")
        return
    with open(mapping_file) as f:
        char_mapping = json.load(f)
    print(f"Loaded character mapping: {len(char_mapping)} glyphs")

    # Book metadata and table of contents live in the first batch.
    with open(book_dir / 'batch_0' / 'metadata.json') as f:
        metadata = json.load(f)
    with open(book_dir / 'batch_0' / 'toc.json') as f:
        toc_data = json.load(f)

    print(f"Book: {metadata['bookTitle']}")
    print(f"Author: {metadata['authors'][0]}")
    print(f"TOC entries: {len(toc_data)}")

    # The decoded glyph stream for the whole book.
    all_glyphs_file = book_dir / 'hash_mapping' / 'all_glyphs.json'
    if not all_glyphs_file.exists():
        print(f"Book file not found: {all_glyphs_file}")
        return
    with open(all_glyphs_file) as f:
        all_glyphs = json.load(f)
    print(f"Loaded {len(all_glyphs)} glyphs from all_glyphs.json")

    # Batch directories in numeric order (batch_0, batch_1, ...).
    print("Building line ending positions...")
    batch_dirs = sorted(
        (d for d in book_dir.iterdir()
         if d.is_dir() and d.name.startswith('batch_')),
        key=lambda d: int(d.name.split('_')[1]),
    )

    # Page dimensions come from the first page of the first batch.
    first_page_file = sorted(batch_dirs[0].glob('page_data_*.json'))[0]
    with open(first_page_file) as f:
        first_page_data = json.load(f)
    page_width = first_page_data[0]['width']
    page_height = first_page_data[0]['height']
    print(f"Page dimensions: {page_width}x{page_height}")

    line_info, glyph_count = _build_line_info(batch_dirs, page_width)
    print(f"Processed {glyph_count} glyphs with line break info")

    # Create the EPUB container and metadata.
    print("Creating EPUB...")
    book = epub.EpubBook()
    book.set_identifier(metadata.get('asin', 'unknown'))
    book.set_title(metadata['bookTitle'])
    book.set_language(metadata.get('lang', 'en'))
    for author in metadata.get('authors', ['Unknown']):
        book.add_author(author)

    default_css = epub.EpubItem(
        uid="style_default",
        file_name="style/default.css",
        media_type="text/css",
        content=_STYLE,
    )
    book.add_item(default_css)

    # Map TOC position ids onto glyph indices so chapters can be split.
    print("Mapping TOC positions to glyph indices...")
    position_to_glyph_idx = _map_positions(batch_dirs)

    toc_chapters = [
        {
            'label': toc_entry['label'],
            'glyph_idx': position_to_glyph_idx[toc_entry['tocPositionId']],
            'chapter_num': i,
        }
        for i, toc_entry in enumerate(toc_data)
        if toc_entry['tocPositionId'] in position_to_glyph_idx
    ]
    print(f"Found {len(toc_chapters)} TOC entries with positions")

    print("Building chapters with formatting...")
    chapter_contents = _render_chapters(
        all_glyphs, char_mapping, line_info, toc_chapters)

    # Turn each chapter's fragment list into an EpubHtml document.
    print("Creating EPUB chapters...")
    chapters = []
    for toc_ch in toc_chapters:
        ch_num = toc_ch['chapter_num']
        if ch_num not in chapter_contents:
            continue
        chapter_contents[ch_num].append('</p>')

        chapter = epub.EpubHtml(
            title=toc_ch['label'],
            file_name=f'chap_{ch_num:03d}.xhtml',
            lang=metadata.get('lang', 'en'),
        )
        chapter.content = ''.join(chapter_contents[ch_num])
        chapter.add_item(default_css)
        book.add_item(chapter)
        chapters.append(chapter)

    # Table of contents, navigation files, and spine.
    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ['nav'] + chapters

    # Write the result next to the working directory.
    output_file = Path("decoded_book.epub")
    epub.write_epub(str(output_file), book)

    print(f"\nEPUB created successfully: {output_file}")
    print(f"Total chapters: {len(chapters)}")
|
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()