"""Download the page images of an Issuu document into a temporary directory."""

import re
import tempfile
from pathlib import Path
from urllib.parse import urlsplit

from requests import HTTPError, request

# Matches "https://issuu.com/<username>/docs/<document_id>". Anything after the
# document id (trailing slash, further path segments, query string, fragment)
# is tolerated but not captured, so it cannot leak into the document id.
_ISSUU_URL_PATTERN = re.compile(
    r"https://issuu\.com/([^/?#]+)/docs/([^/?#]+)(?:[/?#].*)?$"
)

# Sample document used when main() is called without an explicit URL.
DEFAULT_ISSUU_URL = "https://issuu.com/bscyb1898/docs/yb_mag_nr._1_saison_2025_26"


def parse_issuu_url(url: str) -> tuple[str, str]:
    """Extract the username and document id from an Issuu document URL.

    Args:
        url: URL of the form ``https://issuu.com/<username>/docs/<document_id>``.
            Surrounding whitespace and anything following the document id
            (trailing slash, further path segments, query string, fragment)
            are ignored.

    Returns:
        A ``(username, document_id)`` tuple.

    Raises:
        ValueError: If ``url`` is not an Issuu document URL, or if the
            username or document id is empty.
    """
    mtc = _ISSUU_URL_PATTERN.match(url.strip())
    if mtc is None:
        raise ValueError("Issuu URL not Valid!")
    return mtc.group(1), mtc.group(2)


def create_working_dir() -> Path:
    """Create a fresh temporary working directory.

    The directory is created with ``tempfile.mkdtemp`` using the prefix
    ``issuu2epub_``. ``mkdtemp`` does not delete the directory afterwards, so
    the caller is responsible for removing it.

    Returns:
        Path to the newly created directory.
    """
    return Path(tempfile.mkdtemp(prefix="issuu2epub_"))


def get_page_urls(username: str, document_id: str) -> list[str]:
    """Return the image URL of every page of a document.

    Fetches ``reader3_4.json`` for the document from ``reader3.isu.pub`` and
    builds one ``https://`` URL per entry of ``document.pages`` from that
    entry's ``imageUri`` field.

    Args:
        username: Username part of the Issuu document URL.
        document_id: Document id part of the Issuu document URL.

    Returns:
        Page image URLs, in the order the pages are listed in the JSON.

    Raises:
        HTTPError: If the server answers the JSON request with an error
            status. The response is attached to the exception.
    """
    json_url = f"https://reader3.isu.pub/{username}/{document_id}/reader3_4.json"
    r = request("GET", json_url, timeout=(5, 5))
    if not r.ok:
        # Attach the response so callers can inspect the status code and body.
        raise HTTPError("Failed to download document information", response=r)

    document_data = r.json()
    return [
        f"https://{page['imageUri']}" for page in document_data["document"]["pages"]
    ]


def download_pages(page_urls: list[str], working_dir: Path) -> list[Path]:
    """Download every page image into ``working_dir``.

    Each file is named after the last segment of its URL path (query string
    and fragment excluded). If that segment is empty, the file is named
    ``page_<n>`` where ``n`` is the 1-based position of the URL.

    Args:
        page_urls: URLs of the page images to download.
        working_dir: Existing directory the files are written to.

    Returns:
        Paths of the downloaded files, in the same order as ``page_urls``.

    Raises:
        HTTPError: If a page request is answered with an error status.
    """
    page_paths: list[Path] = []

    for index, url in enumerate(page_urls, start=1):
        # Use only the URL path so a query string never ends up in the
        # filename; fall back to a numbered name when the last segment is
        # empty, which would otherwise make open() target the directory.
        filename = urlsplit(url).path.rsplit("/", 1)[-1] or f"page_{index}"
        path = working_dir / filename

        with request("GET", url=url, stream=True, timeout=(10, 10)) as r:
            r.raise_for_status()

            with open(path, "wb") as f:
                # Write the streamed body chunk by chunk, skipping empty chunks.
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Record the path only after the file has been written.
        page_paths.append(path)

    return page_paths


def main(url: str = DEFAULT_ISSUU_URL) -> None:
    """Download all pages of an Issuu document and print the saved paths.

    Args:
        url: Issuu document URL to download. Defaults to the sample document.
    """
    # Validate the URL and fetch the page list first, so a failure does not
    # leave an empty temporary directory behind.
    username, document_id = parse_issuu_url(url)
    urls = get_page_urls(username, document_id)

    working_dir = create_working_dir()
    print(download_pages(urls, working_dir))


if __name__ == "__main__":
    main()