BookfusionPluginsResearch/BookfusionBackSync/sync_worker.py

__license__ = 'GPL v3'

import json
import logging
import http.client
import os
import tempfile
import time
import uuid
import urllib.request
import urllib.parse
import urllib.error
from datetime import datetime

from PyQt5.Qt import QThread, pyqtSignal

from calibre_plugins.bookfusionbacksync.config import prefs

_API_BASE = 'https://bookfusion.com/api'

_GET_HEADERS = {
    'Accept': 'application/json, application/*+json',
    'User-Agent': 'BookFusion/2.22.0 (Android 12; Xiaomi 2201117TG; arm64-v8a)',
}
_POST_HEADERS = {
    **_GET_HEADERS,
    'Content-Type': 'application/json',
}

_DEFAULT_FETCH_PAGE_SIZE = 1337
_DEFAULT_FETCH_TIMEOUT = 90
_SKIP_LOG_SAMPLE_LIMIT = 20
_DEFAULT_COMPLETED_THRESHOLD = 98.9


class SyncWorker(QThread):
    log_message = pyqtSignal(str)
    # fetch_progress(current, total): progressive estimate for initial network fetch
    fetch_progress = pyqtSignal(int, int)
    # progress(current, total): scanning/sync phase, mapped to percent in UI
    progress = pyqtSignal(int, int)
    status = pyqtSignal(str)
    finished = pyqtSignal(int, int)   # updated, skipped

    def __init__(self, db, library_path=None):
        QThread.__init__(self)
        self.db = db
        self.library_path = library_path
        self._stop = False
        self._log = logging.getLogger('bookfusionbacksync')

    def stop(self):
        self._stop = True

    def run(self):
        try:
            self._sync()
        except Exception as exc:
            self._log.exception('Fatal error in sync')
            self.status.emit(f'Error: {exc}')
            self.log_message.emit(f'Fatal error: {exc}')
            self.finished.emit(0, 0)

    # ── File logging ─────────────────────────────────────────────────────────

    def _setup_logging(self):
        base_path = self.library_path or getattr(self.db, 'library_path', None)
        if not base_path:
            base_path = tempfile.gettempdir()
        log_path = os.path.join(base_path, 'bookfusionbacksync.log')
        logger = logging.getLogger('bookfusionbacksync')
        logger.setLevel(logging.DEBUG)
        logger.handlers.clear()
        fh = logging.FileHandler(log_path, encoding='utf-8')
        fh.setFormatter(logging.Formatter(
            '%(asctime)s %(levelname)-8s %(message)s', '%Y-%m-%d %H:%M:%S'
        ))
        logger.addHandler(fh)
        logger.info('─' * 60)
        logger.info('Sync started')
        return logger

    # ── Main flow ────────────────────────────────────────────────────────────

    def _sync(self):
        self._log = self._setup_logging()

        device = prefs['device']
        if not device:
            device = uuid.uuid4().hex[:16]
            prefs['device'] = device
            self._log.info(f'Generated device ID: {device}')

        email    = prefs['email']
        password = prefs['password']
        column   = prefs['last_read_column']
        completed_column = prefs['completed_column']
        completed_threshold = _as_float(
            prefs['completed_threshold'],
            _DEFAULT_COMPLETED_THRESHOLD,
        )
        fetch_page_size = _as_int(
            prefs['fetch_page_size'],
            _DEFAULT_FETCH_PAGE_SIZE,
            min_value=10,
            max_value=50000,
        )
        fetch_timeout = _as_int(
            prefs['fetch_timeout'],
            _DEFAULT_FETCH_TIMEOUT,
            min_value=5,
            max_value=600,
        )
        full_skip_logs = bool(prefs['full_skip_logs'])
        self._log.info(
            f'Email: {email}  Date column: {column}  '
            f'Completed column: {completed_column or "(disabled)"}  '
            f'Completed threshold: {completed_threshold:.2f}%'
        )

        # 1. Authenticate
        self.status.emit('Authenticating…')
        try:
            token = self._login(device, email, password)
        except urllib.error.HTTPError as exc:
            raise RuntimeError(
                f'Login failed (HTTP {exc.code}) — check email and password in Settings'
            )
        self._emit_log(f'Authenticated as {email}')
        if self._stop:
            return self.finished.emit(0, 0)

        # Pre-scan local identifiers once. Used both for matching and
        # as an initial estimate for fetch progress total.
        self.status.emit('Scanning Calibre library…')
        calibre_books = self._calibre_books_with_bf_id()
        self._emit_log(
            f'{len(calibre_books)} Calibre books have a BookFusion identifier'
        )
        expected_fetch_total = max(len(calibre_books), 1)

        # 2. Fetch all BookFusion books (paginated).
        self.status.emit('Fetching BookFusion library…')
        self._log.info(
            f'Fetch settings: per_page={fetch_page_size}, timeout={fetch_timeout}s'
        )
        bf_books = self._fetch_all_books(
            device,
            token,
            per_page=fetch_page_size,
            timeout=fetch_timeout,
            expected_total=expected_fetch_total,
        )
        self._emit_log(f'Fetched {len(bf_books)} books from BookFusion')
        if self._stop:
            return self.finished.emit(0, 0)

        # Build lookups for both id spaces used by BookFusion APIs:
        # - BookV3.id (user library record id)
        # - BookV3.book_id (global uploaded/catalog id, used by official Calibre plugin)
        bf_map_by_id = {}
        bf_map_by_book_id = {}
        bf_completed_by_id = {}
        bf_completed_by_book_id = {}
        bf_all_ids = set()
        bf_all_book_ids = set()
        invalid_date_count = 0
        invalid_percentage_count = 0
        for book in bf_books:
            bf_id = _id_key(book.get('id'))
            bf_book_id = _id_key(book.get('book_id'))
            if bf_id:
                bf_all_ids.add(bf_id)
            if bf_book_id:
                bf_all_book_ids.add(bf_book_id)

            date_str = book.get('last_read_at')
            source = 'last_read_at'
            if not date_str:
                rp = book.get('reading_position') or {}
                date_str = rp.get('updated_at')
                source = 'reading_position.updated_at'
            if date_str:
                dt = _parse_iso(date_str)
                if dt is None:
                    invalid_date_count += 1
                    continue
                if bf_id:
                    bf_map_by_id[bf_id] = (dt, source)
                if bf_book_id:
                    bf_map_by_book_id[bf_book_id] = (dt, source)

            rp = book.get('reading_position') or {}
            pct = _as_float(rp.get('percentage'))
            if pct is None:
                if rp.get('percentage') is not None:
                    invalid_percentage_count += 1
                continue
            if pct >= completed_threshold:
                if bf_id:
                    bf_completed_by_id[bf_id] = pct
                if bf_book_id:
                    bf_completed_by_book_id[bf_book_id] = pct

        self._emit_log(
            f'BookFusion date map: by book_id={len(bf_map_by_book_id)}, by id={len(bf_map_by_id)}, '
            f'invalid_dates={invalid_date_count}'
        )
        self._emit_log(
            f'BookFusion completed map (threshold>={completed_threshold:.2f}%): '
            f'by book_id={len(bf_completed_by_book_id)}, by id={len(bf_completed_by_id)}, '
            f'invalid_percentage={invalid_percentage_count}'
        )
        self._emit_log(
            f'BookFusion ids observed: book_id={len(bf_all_book_ids)}, id={len(bf_all_ids)}'
        )

        # 3. Match/write phase (progress bar switches to determinate mode)
        self.status.emit('Scanning Calibre library…')

        def _calibre_sort_key(item):
            _, bf_id = item
            entry = bf_map_by_book_id.get(bf_id) or bf_map_by_id.get(bf_id)
            if entry is None:
                return (1, 0.0)
            dt, _ = entry
            return (0, -dt.timestamp())

        calibre_books.sort(key=_calibre_sort_key)
        self._log.info('Calibre processing order: newest read dates first; books without dates last')

        total = len(calibre_books)
        date_updates = {}
        completed_updates = {}
        skipped = 0
        date_matched_by_book_id = 0
        date_matched_by_id = 0
        completed_matched_by_book_id = 0
        completed_matched_by_id = 0
        skipped_no_date = 0
        skipped_not_in_api = 0
        skip_no_date_samples = 0
        skip_not_in_api_samples = 0

        last_scan_percent = -1
        for i, (cal_id, bf_id) in enumerate(calibre_books):
            if self._stop:
                break
            if total > 0:
                scan_percent = int(i * 100 / total)
                if scan_percent != last_scan_percent:
                    last_scan_percent = scan_percent
                    self.progress.emit(i, total)

            date_entry = bf_map_by_book_id.get(bf_id)
            date_match_via = 'book_id'
            if date_entry is None:
                date_entry = bf_map_by_id.get(bf_id)
                date_match_via = 'id'

            completed_pct = None
            completed_match_via = 'book_id'
            if completed_column:
                completed_pct = bf_completed_by_book_id.get(bf_id)
                if completed_pct is None:
                    completed_pct = bf_completed_by_id.get(bf_id)
                    completed_match_via = 'id'

            if date_entry is None and completed_pct is None:
                skipped += 1
                if bf_id in bf_all_book_ids or bf_id in bf_all_ids:
                    skipped_no_date += 1
                    if full_skip_logs or skip_no_date_samples < _SKIP_LOG_SAMPLE_LIMIT:
                        title = self._book_title(cal_id)
                        self._log.debug(
                            f'SKIP  {title!r} — found in BookFusion but has no parseable read date '
                            f'(bf_id={bf_id})'
                        )
                        skip_no_date_samples += 1
                else:
                    skipped_not_in_api += 1
                    if full_skip_logs or skip_not_in_api_samples < _SKIP_LOG_SAMPLE_LIMIT:
                        title = self._book_title(cal_id)
                        self._log.debug(
                            f'SKIP  {title!r} — not in BookFusion library (bf_id={bf_id})'
                        )
                        skip_not_in_api_samples += 1
                continue

            title = self._book_title(cal_id)
            if date_entry is not None:
                dt, source = date_entry
                date_updates[cal_id] = dt
                if date_match_via == 'book_id':
                    date_matched_by_book_id += 1
                else:
                    date_matched_by_id += 1
                self._emit_log(
                    f'OK    {title}  →  {dt.strftime("%Y-%m-%d")}  '
                    f'(from {source}, match={date_match_via})'
                )

            if completed_pct is not None:
                completed_updates[cal_id] = True
                if completed_match_via == 'book_id':
                    completed_matched_by_book_id += 1
                else:
                    completed_matched_by_id += 1
                self._emit_log(
                    f'OK    {title}  →  Completed=True  '
                    f'(percentage={completed_pct:.3f}, threshold={completed_threshold:.2f}, '
                    f'match={completed_match_via})'
                )

        if self._stop:
            self.log_message.emit('Sync cancelled.')
            self._log.info('Sync cancelled by user')
            return self.finished.emit(len(date_updates), skipped)

        # 4. Write all updates to Calibre in one call
        if date_updates:
            self.status.emit(f'Writing {len(date_updates)} dates to Calibre…')
            self._log.info(f'Writing {len(date_updates)} dates to column {column!r}')
            self.db.set_field(column, date_updates)
            self._emit_log(f'Applied {len(date_updates)} date updates to {column}')

        if completed_column and completed_updates:
            self.status.emit(f'Writing {len(completed_updates)} completed flags to Calibre…')
            self._log.info(
                f'Writing {len(completed_updates)} completed flags to column {completed_column!r}'
            )
            self.db.set_field(completed_column, completed_updates)
            self._emit_log(
                f'Applied {len(completed_updates)} completed flags to {completed_column}'
            )

        self._log.info(
            'Match stats — '
            f'date_updates={len(date_updates)}, '
            f'date_match_book_id={date_matched_by_book_id}, date_match_id={date_matched_by_id}, '
            f'completed_updates={len(completed_updates)}, '
            f'completed_match_book_id={completed_matched_by_book_id}, '
            f'completed_match_id={completed_matched_by_id}, '
            f'skipped_no_date={skipped_no_date}, skipped_not_in_api={skipped_not_in_api}, '
            f'logged_skip_lines={skip_no_date_samples + skip_not_in_api_samples}, '
            f'full_skip_logs={full_skip_logs}'
        )

        self._log.info(f'Sync finished — {len(date_updates)} updated, {skipped} skipped')
        self.progress.emit(total, total)
        self.finished.emit(len(date_updates), skipped)

    # ── HTTP with exponential back-off ───────────────────────────────────────

    def _fetch_json(self, url, post_body=None, timeout=20):
        """GET or POST, returning parsed JSON.

        Retries up to 3 times on network/timeout errors with delays 2 s → 4 s.
        Raises urllib.error.HTTPError immediately (4xx/5xx are not transient).
        """
        headers = _POST_HEADERS if post_body is not None else _GET_HEADERS
        retry_delays = [2, 4]
        last_exc = None

        for attempt in range(3):
            if self._stop:
                raise RuntimeError('Sync stopped')
            method = 'POST' if post_body is not None else 'GET'
            self._log.debug(f'{method} {url}  (attempt {attempt + 1}/3)')
            try:
                req = urllib.request.Request(url, data=post_body, headers=headers)
                with urllib.request.urlopen(req, timeout=timeout) as resp:
                    data = resp.read()
                    self._log.debug(f'  → {resp.status}  {len(data)} bytes')
                    return json.loads(data)
            except urllib.error.HTTPError:
                raise   # auth failures, 404, etc. — no retry
            except (urllib.error.URLError, OSError, TimeoutError, http.client.IncompleteRead) as exc:
                last_exc = exc
                self._log.warning(f'  Attempt {attempt + 1}/3 failed: {exc}')
                if attempt < 2:
                    wait = retry_delays[attempt]
                    msg = f'Network error — retrying in {wait}s… ({exc})'
                    self.log_message.emit(msg)
                    self._log.info(f'  Waiting {wait}s before retry')
                    time.sleep(wait)

        raise RuntimeError(f'Failed after 3 attempts: {last_exc}')

    # ── Auth ─────────────────────────────────────────────────────────────────

    def _login(self, device, email, password):
        body = json.dumps(
            {'device': device, 'login': email, 'password': password}
        ).encode('utf-8')
        data = self._fetch_json(f'{_API_BASE}/v1/auth.json', post_body=body)
        return data['token']

    # ── BookFusion library fetch ──────────────────────────────────────────────

    def _fetch_all_books(self, device, token, per_page, timeout, expected_total):
        all_books = []
        page = 1
        pages_fetched = 0
        total_estimate = max(expected_total, 1)
        self.fetch_progress.emit(0, total_estimate)
        while not self._stop:
            params = urllib.parse.urlencode({
                'device': device,
                'token': token,
                'page': page,
                'per_page': per_page,
            })
            page_data = self._fetch_json(
                f'{_API_BASE}/v3/library/books.json?{params}',
                timeout=timeout,
            )
            if not page_data:
                self.fetch_progress.emit(total_estimate, total_estimate)
                break
            all_books.extend(page_data)
            pages_fetched += 1
            fetched = len(all_books)
            if fetched > total_estimate:
                total_estimate = fetched
            if len(page_data) == per_page and fetched >= total_estimate:
                total_estimate = fetched + per_page
            self._log.debug(
                f'Page {page}: {len(page_data)} books  (running total: {len(all_books)})'
            )
            self.fetch_progress.emit(min(fetched, total_estimate), total_estimate)
            self.status.emit(f'Fetching BookFusion library… {len(all_books)} books')
            if len(page_data) < per_page:
                self.fetch_progress.emit(total_estimate, total_estimate)
                break
            page += 1
        return all_books

    # ── Calibre helpers ──────────────────────────────────────────────────────

    def _calibre_books_with_bf_id(self):
        result = []
        for cal_id in self.db.all_book_ids():
            ids = self.db.field_for('identifiers', cal_id) or {}
            bf_id = ids.get('bookfusion')
            if bf_id:
                result.append((cal_id, str(bf_id)))
        return result

    def _book_title(self, cal_id):
        return self.db.field_for('title', cal_id) or f'Book #{cal_id}'

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _emit_log(self, msg, level='info'):
        self.log_message.emit(msg)
        getattr(self._log, level)(msg)


# ── Utility ──────────────────────────────────────────────────────────────────

def _parse_iso(s):
    """Parse ISO 8601 string (Z or +00:00 suffix) into a tz-aware datetime."""
    try:
        return datetime.fromisoformat(s.replace('Z', '+00:00'))
    except (ValueError, AttributeError):
        return None


def _id_key(value):
    if value is None:
        return ''
    return str(value)


def _as_float(value, default=None):
    try:
        if value is None:
            return default
        return float(value)
    except (TypeError, ValueError):
        return default


def _as_int(value, default, min_value=None, max_value=None):
    try:
        if value is None:
            return default
        result = int(value)
    except (TypeError, ValueError):
        return default

    if min_value is not None and result < min_value:
        return min_value
    if max_value is not None and result > max_value:
        return max_value
    return result