Work in progress.

This commit is contained in:
Michael Lehmann
2025-10-05 14:45:24 +02:00
parent 71e6c5dd48
commit 2435bd7888

77
main.py
View File

@@ -1,6 +1,81 @@
import re
import tempfile
from pathlib import Path
from requests import HTTPError, request
def parse_issuu_url(url: str) -> tuple[str, str]:
    """Extract the username and document id from an Issuu document URL.

    returns:
        username: str
        document_id: str

    Raises:
        ValueError: if the URL is not of the form
            https://issuu.com/<username>/docs/<document_id>.
    """
    pattern = re.compile(r"https://issuu.com/([^/]*)/docs/(.*)$")
    match = pattern.match(url)
    if match is None:
        raise ValueError("Issuu URL not Valid!")
    return match.group(1), match.group(2)
def create_working_dir() -> Path:
    """Create a fresh temporary working directory.

    returns:
        Path() to a temporary directory (prefix "issuu2epub_").
    """
    # mkdtemp creates the directory on disk; caller owns cleanup.
    return Path(tempfile.mkdtemp(prefix="issuu2epub_"))
def get_page_urls(username: str, document_id: str) -> list[str]:
    """Fetch the reader JSON for an Issuu document and list its page image URLs.

    Args:
        username: Issuu account name (first path segment of the document URL).
        document_id: document slug (segment after /docs/).

    returns:
        One "https://..." image URL per page, in page order.

    Raises:
        HTTPError: if the document information could not be downloaded.
    """
    json_url = f"https://reader3.isu.pub/{username}/{document_id}/reader3_4.json"
    r = request("GET", json_url, timeout=(5, 5))
    if not r.ok:
        # Attach the response so callers can inspect the status code/body,
        # matching what requests' own raise_for_status() provides.
        raise HTTPError("Failed to download document information", response=r)
    document_data = r.json()
    # imageUri values in the JSON are scheme-less; prepend https://.
    return [
        f"https://{page['imageUri']}" for page in document_data["document"]["pages"]
    ]
def download_pages(page_urls: list[str], working_dir: Path) -> list[Path]:
    """Download all page images into *working_dir* and return their file paths.

    Args:
        page_urls: image URLs; the last URL path segment is used as the filename.
        working_dir: existing directory to write the images into.

    returns:
        Paths of the downloaded files, in the same order as page_urls.

    Raises:
        requests.HTTPError: if any download returns an error status.
    """
    page_paths: list[Path] = []
    for url in page_urls:
        filename = url.split("/")[-1]
        # Path / str already yields a Path — no extra Path() wrap needed.
        path = working_dir / filename
        # Stream to disk in chunks so large pages are never fully in memory.
        with request("GET", url=url, stream=True, timeout=(10, 10)) as r:
            r.raise_for_status()
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # Record the path only after the file was written successfully.
        page_paths.append(path)
    return page_paths
def main() -> None:
    """Download all pages of a hard-coded Issuu document into a temp dir.

    Entry point: parses the document URL, fetches the page list, downloads
    every page image, and prints the resulting file paths.
    """
    # NOTE: removed a dead `pass` statement that preceded the real body.
    cwd = create_working_dir()
    username, document_id = parse_issuu_url(
        "https://issuu.com/bscyb1898/docs/yb_mag_nr._1_saison_2025_26"
    )
    urls = get_page_urls(username, document_id)
    print(download_pages(urls, cwd))
if __name__ == "__main__":