#!/usr/bin/env python3.9

from enum import Enum
from pathlib import Path
import re
import sys
from time import sleep
from typing import Callable, Iterator, NamedTuple, Optional
from typing import Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from rich import inspect, print, traceback
from urllib3.util.url import Url, parse_url, LocationParseError

traceback.install()

SUPERSECRET_DOMAIN = input("Enter the super secret domain avec les polys de maths: ")

class WithinWhat(Enum):
	"""How far away from the entrypoint the crawler is allowed to wander."""

	internet = 0  # no restriction at all
	domain = 1  # stay on the entrypoint's host
	path = 2  # stay within the entrypoint's parent path  # FIXME


def _get_absolute_href(parent: Url, child: str) -> Url:
	"""
	Resolve the href `child` against the page URL `parent`, returning an
	absolute Url.

	Delegates to urllib.parse.urljoin, which implements RFC 3986 reference
	resolution and covers every case the previous hand-rolled logic tried to
	handle: absolute URLs, "./" and "../" references, bare relative paths,
	protocol-relative "//host/..." references, queries and fragments.

	(The old implementation discarded its own stripped-parent computation —
	resolving "foo.pdf" against ".../a/b.html" as ".../a/b.html/foo.pdf" —
	contained a dead assignment, several unreachable branches, and one path
	that returned a plain str instead of a Url.)

	- parent: the URL of the page `child` was found on
	- child: the raw href attribute value

	Raises LocationParseError if the joined result cannot be parsed.
	"""
	return parse_url(urljoin(parent.url, child))


def _get_links(entrypoint: str, content: str, within: WithinWhat) -> Iterator[str]:
	"""
	Yield all hrefs from <a> tags in the given page that stay within `within`.

	- entrypoint: The page's URL. Used to resolve relative hrefs and to apply
	  the `within` restriction.
	- content: The page's HTML content.
	- within: scope restriction, see WithinWhat.

	Hrefs that cannot be parsed are silently skipped.
	"""
	root = BeautifulSoup(content, features="html.parser")
	# Don't shadow the `entrypoint` parameter with its parsed form.
	parsed_entrypoint = parse_url(entrypoint)

	for link in root("a"):
		raw_href = link.attrs.get("href")
		if not raw_href:
			continue

		try:
			href = _get_absolute_href(parsed_entrypoint, raw_href)
		except LocationParseError:
			continue

		if href is None:
			continue

		# A URL without an explicit path (e.g. "http://host") has .path == None;
		# substitute "/" so Path() doesn't raise TypeError.
		is_within = {
			WithinWhat.domain: href.host == parsed_entrypoint.host,
			WithinWhat.path: Path(href.path or "/").parent
			== Path(parsed_entrypoint.path or "/").parent,
			WithinWhat.internet: True,
		}[within]

		if is_within:
			yield href.url


def crawl(
	entrypoint: str,
	do: Optional[Callable[[requests.Response, Url], Union[None, str]]] = None,
	within: WithinWhat = WithinWhat.domain,
	politeness: int = 0,
	pattern: Optional["re.Pattern"] = None,
	avoid: Optional["re.Pattern"] = None,
	only: Optional["re.Pattern"] = None,
	max_depth: Optional[int] = None,
	_depth=0,
	_seen=None,
) -> set[str]:
	"""
	Return a set of all links mentioned in all pages linking from `entrypoint`, recursively.

	- do: Run a function on each URL that will be returned. The advantage is that you have access to the `requests.Response` object, no need to hit the network again. May return a (multi-line) status message to display, or None.
	- within: Restrict crawling to URLs within a certain range:
	  - `WithinWhat.internet`: No restrictions
	  - `WithinWhat.path`: Only crawl children paths of `entrypoint`
	  - `WithinWhat.domain`: Only crawl URLs that have the same host (or "domain") as `entrypoint`
	- politeness: How many seconds to wait between each request
	- pattern: Return only links matching `pattern`
	- avoid: Avoid crawling links from pages matching `avoid`
	- only: Only crawl links from pages matching `only`
	- max_depth: Do not recurse deeper than this. Note that Python's own recursion limit will supersede this one.

	_depth & _seen are internal attributes used when calling the function recursively
	"""
	seen = _seen or set()
	# Indent log output proportionally to the recursion depth.
	p = lambda text: print("  " * _depth + text)

	try:
		parsed_entrypoint = parse_url(entrypoint)
	except LocationParseError:
		if _depth > 0:
			# Deep in the recursion, a bad link is not worth aborting for.
			return seen
		else:
			raise ValueError(f"Couldn't parse URL {entrypoint!r}")

	if (avoid and avoid.search(entrypoint)) or (only and not only.search(entrypoint)):
		p(f"[bold yellow]←[/] Avoiding {entrypoint}")
		return seen

	# Be polite: rate-limit every request except the very first one.
	if _depth > 0 and politeness:
		p(f"[bold dim]…[/] Sleeping for {politeness} s")
		sleep(politeness)

	resp = requests.get(entrypoint)

	if resp.status_code >= 400:
		p(f"[bold red]⚠[/] Error {resp.status_code} while requesting {entrypoint}")
		return seen

	# Servers may omit Content-Type entirely; default to "" so the
	# `"html" in ...` membership tests below can't raise TypeError.
	content_type = resp.headers.get("Content-Type", "") or ""

	if pattern and not pattern.search(entrypoint) and "html" not in content_type:
		p(f"[bold dim]…[/] [dim]Ignoring {entrypoint}")
		return seen

	# NOTE(review): a URL that matches `pattern` (or any URL when no pattern is
	# given) is stashed and NOT crawled further — recursion only happens for
	# non-matching HTML pages. Confirm this asymmetry is intended.
	if not pattern or pattern.search(entrypoint):
		if not do:
			p(f"[bold cyan]→[/] Stashing {entrypoint}")

		seen.add(entrypoint)

		if do:
			p(
				f"[bold green]>[/] Executing [green]{do.__qualname__}[/green] on {entrypoint}"
			)
			msg = do(resp, parsed_entrypoint)
			# `do` is allowed to return None (nothing to display).
			for line in (msg or "").splitlines():
				p("[dim]│[/] " + line)

		return seen

	# `max_depth=0` must mean "don't recurse", so test for None explicitly
	# instead of `max_depth or ...` (which treated 0 as "unlimited").
	depth_limit = max_depth if max_depth is not None else sys.getrecursionlimit()
	if "html" in content_type and _depth <= depth_limit:
		p(f"[bold magenta]↘️[/] Crawling {entrypoint}")

		for url in _get_links(entrypoint, resp.text, within=within):
			if url in seen:
				continue

			seen |= crawl(
				entrypoint=url,
				do=do,
				avoid=avoid,
				only=only,
				within=within,
				pattern=pattern,
				politeness=politeness,
				# BUG FIX: max_depth was previously not forwarded, so the
				# limit was silently ignored below the first level.
				max_depth=max_depth,
				_depth=_depth + 1,
				_seen=seen | {url},
			)
	return seen


def save(resp: requests.Response, url: Url) -> str:
	"""
	`do`-callback for crawl(): save the fetched document as NN[-qc]-sujet.pdf
	(NN = first run of digits found in the URL path, taken as the week number)
	and, for PDFs, also try to fetch the matching .tex source plus any
	\\input{} dependencies it declares.

	- resp: the already-fetched response for `url`
	- url: the parsed URL of the document

	Returns a (possibly multi-line) rich-markup status message.
	"""
	msg = ""
	path = Path(".")
	try:
		week_index = int(re.search(r"\d+", url.path).group(0))
	except AttributeError:
		# re.search returned None: no digits anywhere in the file name.
		return "[red]![/red] No number in file name, skipping"

	if "QC" in url.path:
		path /= f"{week_index:02}-qc-sujet.pdf"
	else:
		path /= f"{week_index:02}-sujet.pdf"

	# NOTE(review): `path` always ends in ".pdf" (see above), so this
	# condition can never be true and existing files are always re-downloaded.
	# Left as-is pending confirmation of the intended skip behavior.
	if path.exists() and path.suffix != ".pdf":
		return "[bold dim]…[/] Skipping as file already exists"

	path.parent.mkdir(parents=True, exist_ok=True)

	# NOTE(review): dead branch for the same reason — the suffix is always
	# ".pdf", so everything goes through write_bytes.
	if path.suffix in (".tex", ".ml", ".py"):
		path.write_text(resp.text)
	else:
		path.write_bytes(resp.content)

	msg += f"[bold blue]↓[/] Saved as [cyan]{path}[/]\n"

	if path.suffix == ".pdf":
		# Guess the LaTeX source URL by swapping the extension.
		tex_source = requests.get(url.url.removesuffix(".pdf") + ".tex")
		if tex_source.status_code < 400:
			msg += "[bold red]♥[/] Found a LaTeX source\n"
			tex_path = path.parent / "sources" / path.with_suffix(".tex").name
			# BUG FIX: the "sources" directory was never created, so
			# write_text raised FileNotFoundError on the first LaTeX source.
			tex_path.parent.mkdir(parents=True, exist_ok=True)
			tex_path.write_text(tex_source.text)

			# Download every \input{...} dependency of the source.
			for dependency in re.finditer(r"\\input\{(.+)\}", tex_source.text):
				extracted_path = dependency.group(1).replace("Entetes", "entetes")
				dep_save_path = path.parent / ".headers" / "kholles.tex"
				# Resolve the dependency path relative to the document's URL.
				dep_url = f"http://{SUPERSECRET_DOMAIN}/" + str(
					(Path(url.path).parent / Path(extracted_path)).resolve().absolute()
				)
				if (dep_source := requests.get(dep_url)).status_code < 400:
					dep_save_path_display = dep_save_path.resolve().relative_to(
						Path(".").resolve()
					)
					if not dep_save_path.exists():
						dep_save_path.parent.mkdir(parents=True, exist_ok=True)
						# Rewrite the \input target so the saved .tex compiles
						# against the locally-saved header.
						dep_save_path.write_text(
							dep_source.text.replace(
								extracted_path, "../.headers/kholles.tex"
							)
						)
						msg += f"[bold red]@[/] Saved a dependency as [cyan]{dep_save_path_display}[/]\n"
					else:
						msg += f"[bold red]@[/] Found a dependency as [cyan]{dep_save_path_display}[/]\n"

	return msg.strip()


# TODO: parse this:
# 	<html>
# 	<head>
# 	<title>Page de la MPSI du [DATA EXPUNGED]</title>
# 	<meta content="0; URL=./maths/index.html" http-equiv="refresh"/>
# 	</head>
# 	<body>
# 	<link href="./maths/images/icones/fav1.png" rel="shortcut icon" type="image/x-icon"/>
# 	<!-- <a href=./pyzo_distro-2013c.win64.exe>ici</a> -->
# 	</body>
# 	</html>
# as a redirect to http://{SUPERSECRET_DOMAIN}/maths/index.html

# Entry point: crawl the kholle index and run save() on every document whose
# URL matches the pattern, staying within kholles/ paths, one request per 3 s.
try:
	crawl(
		f"http://{SUPERSECRET_DOMAIN}/maths/kholles/index.html",
		do=save,
		pattern=re.compile(r"\.(pdf|py|ml|tex|png|jpg)$"),
		only=re.compile(r"kholles/"),
		politeness=3,
	)
except KeyboardInterrupt:
	# Ctrl-C is a normal way to stop a crawl; exit cleanly instead of a traceback.
	print("\n\n[bold red]×[/] [red]Cancelled")
