"""Locate and download schedule PDFs published in a Google Drive folder.

The folder is read through Drive's embedded folder view (an HTML page), and
files are recognised purely by their name, which must look like
``DD.MM.YY по учащимся.pdf``.
"""

from __future__ import annotations

import logging
import re
import ssl
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

import aiohttp
import certifi
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

DRIVE_FOLDER_EMBED = (
    "https://drive.google.com/embeddedfolderview?id={folder_id}#list"
)
DRIVE_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id={file_id}"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Compiled once at import time; groups are (day, month, two-digit year).
_SCHEDULE_NAME_RE = re.compile(
    r"^(\d{2})\.(\d{2})\.(\d{2})\s+по\s+учащимся\.pdf$",
    re.IGNORECASE,
)


def _drive_connector() -> aiohttp.TCPConnector:
    """Return a TCP connector whose TLS context trusts certifi's CA bundle."""
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    return aiohttp.TCPConnector(ssl=ssl_context)


@dataclass(frozen=True)
class DriveScheduleFile:
    """One schedule PDF found in the Drive folder."""

    # Drive file id, taken from the listing entry's ``id`` attribute.
    file_id: str
    # File title exactly as shown in the folder listing.
    name: str
    # Date parsed from the file name (naive, midnight).
    schedule_date: datetime


class DriveScheduleSource:
    """Lists and downloads schedule PDFs from a single Drive folder.

    The parsed listing is cached on the instance after the first successful
    fetch; pass ``force_refresh=True`` to ``list_student_schedules`` to
    fetch it again.
    """

    def __init__(self, folder_id: str):
        """Remember the folder id; no network access happens here."""
        self.folder_id = folder_id
        self._files_cache: Optional[List[DriveScheduleFile]] = None

    @staticmethod
    def _parse_filename_date(name: str) -> Optional[datetime]:
        """Extract the schedule date from a file name.

        Args:
            name: File title; surrounding whitespace is ignored.

        Returns:
            The date encoded as ``DD.MM.YY`` (year interpreted as 20YY), or
            ``None`` when the name does not match the expected pattern or
            the digits do not form a real calendar date.
        """
        match = _SCHEDULE_NAME_RE.match(name.strip())
        if not match:
            return None
        day, month, year = map(int, match.groups())
        try:
            return datetime(2000 + year, month, day)
        except ValueError:
            # e.g. "31.02.24": the pattern matches but the date is impossible.
            # Skip this file instead of failing the whole folder listing.
            logger.warning("Ignoring file with invalid date in name: %r", name)
            return None

    async def list_student_schedules(
        self, force_refresh: bool = False
    ) -> List[DriveScheduleFile]:
        """Return the folder's schedule files sorted by date, oldest first.

        Args:
            force_refresh: When true, bypass the cached listing and fetch the
                folder page again.

        Returns:
            Files whose names parse as schedule names. Entries without an
            ``entry-`` id, without a title element, or with an unparseable
            name are skipped.

        Raises:
            aiohttp.ClientResponseError: If the folder page responds with an
                HTTP error status.
        """
        if self._files_cache is not None and not force_refresh:
            return self._files_cache

        url = DRIVE_FOLDER_EMBED.format(folder_id=self.folder_id)
        async with aiohttp.ClientSession(
            headers={"User-Agent": USER_AGENT},
            connector=_drive_connector(),
        ) as session:
            async with session.get(url) as resp:
                resp.raise_for_status()
                html = await resp.text()

        soup = BeautifulSoup(html, "html.parser")
        files: List[DriveScheduleFile] = []
        for entry in soup.select("div.flip-entry"):
            entry_id = entry.get("id", "")
            if not entry_id.startswith("entry-"):
                continue
            file_id = entry_id.removeprefix("entry-")

            title_el = entry.select_one(".flip-entry-title")
            if not title_el:
                continue
            name = title_el.get_text(strip=True)

            schedule_date = self._parse_filename_date(name)
            if schedule_date is None:
                continue
            files.append(
                DriveScheduleFile(
                    file_id=file_id,
                    name=name,
                    schedule_date=schedule_date,
                )
            )

        files.sort(key=lambda item: item.schedule_date)
        self._files_cache = files
        return files

    async def find_for_date(self, target: datetime) -> Optional[DriveScheduleFile]:
        """Return the schedule file for the calendar day of ``target``.

        Only year, month and day are compared. When several files share the
        date, the one latest in the sorted listing is returned.

        Returns:
            The matching file, or ``None`` if there is none.
        """
        files = await self.list_student_schedules()
        wanted = (target.year, target.month, target.day)
        for item in reversed(files):
            found = item.schedule_date
            if (found.year, found.month, found.day) == wanted:
                return item
        return None

    async def download_pdf(self, file_id: str) -> bytes:
        """Download a file by Drive id and return the raw response body.

        Raises:
            aiohttp.ClientResponseError: If the download responds with an
                HTTP error status.
        """
        url = DRIVE_DOWNLOAD_URL.format(file_id=file_id)
        async with aiohttp.ClientSession(
            headers={"User-Agent": USER_AGENT},
            connector=_drive_connector(),
        ) as session:
            async with session.get(url) as resp:
                resp.raise_for_status()
                return await resp.read()