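"""Download the page images of a public Issuu document.

The images are saved into a temporary working directory.
"""
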
import re
import tempfile
from pathlib import Path

from requests import HTTPError, request


def parse_issuu_url(url: str) -> tuple[str, str]:
    """Get the username and document_id from an Issuu document URL.

    returns:
        username: str
        document_id: str
    """
    # Expects a canonical https://issuu.com/<username>/docs/<document_id> URL.
    issuu_url_pattern = re.compile(r"https://issuu.com/([^/]*)/docs/(.*)$")
    if mtc := issuu_url_pattern.match(url):
        username = mtc.group(1)
        document_id = mtc.group(2)
    else:
        raise ValueError("Not a valid Issuu URL!")

    return username, document_id


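# parse_issuu_url() example, with a hypothetical URL:
#   parse_issuu_url("https://issuu.com/acme/docs/catalog_2024")
#   -> ("acme", "catalog_2024")
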
def create_working_dir() -> Path:
    """Create a working directory.

    returns:
        Path to a temporary directory.
    """
    working_dir = tempfile.mkdtemp(prefix="issuu2epub_")
    return Path(working_dir)


def get_page_urls(username: str, document_id: str) -> list[str]:
    """Get the image URLs of all pages of the document."""
    json_url = f"https://reader3.isu.pub/{username}/{document_id}/reader3_4.json"
    r = request("GET", json_url, timeout=(5, 5))
    if not r.ok:
        raise HTTPError("Failed to download document information")

    document_data = r.json()
    return [
        f"https://{page['imageUri']}" for page in document_data["document"]["pages"]
    ]


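# get_page_urls() assumes the reader3_4.json payload looks roughly like
#   {"document": {"pages": [{"imageUri": "<host>/<path>/page_1.jpg", ...}, ...]}},
# i.e. each page carries a scheme-less imageUri, hence the "https://" prefix above.
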
def download_pages(page_urls: list[str], working_dir: Path) -> list[Path]:
    """Download all page images and return their file paths."""
    page_paths = []

    for url in page_urls:
        filename = url.split("/")[-1]
        path = working_dir / filename
        page_paths.append(path)

        # Stream each image to disk in 8 KiB chunks instead of loading it
        # into memory in one piece.
        with request("GET", url=url, stream=True, timeout=(10, 10)) as r:
            r.raise_for_status()

            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

    return page_paths


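# Pages are downloaded in the order listed in reader3_4.json, which is assumed to
# be reading order; the "issuu2epub_" prefix suggests a later EPUB packaging step
# that this script does not implement yet.
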
def main() -> None:
    """Download all pages of a sample Issuu document and print their file paths."""
    cwd = create_working_dir()
    # Hard-coded sample document; swap in the document you want to download.
    username, document_id = parse_issuu_url(
        "https://issuu.com/bscyb1898/docs/yb_mag_nr._1_saison_2025_26"
    )

    urls = get_page_urls(username, document_id)

    print(download_pages(urls, cwd))


if __name__ == "__main__":
    main()