Files
myfirstprogram/services/drive_schedule_source.py

110 lines
3.5 KiB
Python

from __future__ import annotations
import logging
import re
import ssl
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
import aiohttp
import certifi
from bs4 import BeautifulSoup
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# URL template for a Drive folder's embedded list view; ``{folder_id}`` is
# filled in with the folder identifier before the page is requested.
DRIVE_FOLDER_EMBED = "https://drive.google.com/embeddedfolderview?id={folder_id}#list"

# URL template for downloading a single Drive file; ``{file_id}`` is filled
# in with the file identifier.
DRIVE_DOWNLOAD_URL = (
    "https://drive.google.com/uc"
    "?export=download&id={file_id}"
)

# Browser-style User-Agent header value sent with every request made here.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
def _drive_connector() -> aiohttp.TCPConnector:
    """Build a new aiohttp TCP connector that verifies TLS against certifi's CA bundle.

    Returns:
        A fresh ``aiohttp.TCPConnector``; every call creates a new SSL context
        and a new connector.
    """
    ca_bundle_path = certifi.where()
    tls_context = ssl.create_default_context(cafile=ca_bundle_path)
    return aiohttp.TCPConnector(ssl=tls_context)
@dataclass(frozen=True)
class DriveScheduleFile:
    """Immutable description of one schedule file found in the Drive folder."""

    # Drive identifier of the file (used to build its download URL).
    file_id: str
    # File title as it appears in the folder listing.
    name: str
    # Date the schedule applies to, parsed from the file name.
    schedule_date: datetime
class DriveScheduleSource:
    """Find and download per-student schedule PDFs stored in a Google Drive folder.

    The folder is read through Drive's embedded folder view page. Files are
    recognised by the naming pattern ``DD.MM.YY по учащимся.pdf``. The parsed
    listing is cached on the instance until a refresh is forced.
    """

    # Two-digit day, month and year separated by dots, then the Russian phrase
    # "по учащимся" (roughly "by students") and a ".pdf" extension.
    # Compiled once here instead of on every call.
    _FILENAME_RE = re.compile(
        r"^(\d{2})\.(\d{2})\.(\d{2})\s+по\s+учащимся\.pdf$",
        re.IGNORECASE,
    )

    def __init__(self, folder_id: str):
        """Remember which Drive folder to read; nothing is fetched here.

        Args:
            folder_id: Identifier of the Drive folder holding the schedules.
        """
        self.folder_id = folder_id
        # None means "not fetched yet"; an empty list is a valid cached result.
        self._files_cache: Optional[List[DriveScheduleFile]] = None

    @staticmethod
    def _parse_filename_date(name: str) -> Optional[datetime]:
        """Extract the schedule date from a file name.

        Args:
            name: File title; surrounding whitespace is ignored and matching
                is case-insensitive.

        Returns:
            The date encoded in the name (two-digit year interpreted as
            20YY), or ``None`` when the name does not follow the expected
            pattern or encodes an impossible calendar date.
        """
        match = DriveScheduleSource._FILENAME_RE.match(name.strip())
        if match is None:
            return None
        day, month, year = map(int, match.groups())
        try:
            return datetime(2000 + year, month, day)
        except ValueError:
            # The digits matched the pattern but are not a real date
            # (e.g. "31.02.24"). Treat it like any other unrecognised name so
            # a single mistyped file cannot abort the whole folder listing.
            return None

    def _parse_folder_listing(self, html: str) -> List[DriveScheduleFile]:
        """Turn the embedded folder view HTML into date-sorted schedule files."""
        soup = BeautifulSoup(html, "html.parser")
        files: List[DriveScheduleFile] = []
        for entry in soup.select("div.flip-entry"):
            # Each listed file is a div whose id is "entry-<file id>".
            entry_id = entry.get("id", "")
            if not entry_id.startswith("entry-"):
                continue
            title_el = entry.select_one(".flip-entry-title")
            if not title_el:
                continue
            name = title_el.get_text(strip=True)
            schedule_date = self._parse_filename_date(name)
            if schedule_date is None:
                # Not a schedule file (or an unparseable date): skip it.
                continue
            files.append(
                DriveScheduleFile(
                    file_id=entry_id.removeprefix("entry-"),
                    name=name,
                    schedule_date=schedule_date,
                )
            )
        # Oldest first; the sort is stable, so files sharing a date keep
        # their listing order.
        files.sort(key=lambda item: item.schedule_date)
        return files

    async def list_student_schedules(
        self, force_refresh: bool = False
    ) -> List[DriveScheduleFile]:
        """Return the schedule files in the folder, sorted by date ascending.

        Args:
            force_refresh: When true, ignore the cached listing and fetch the
                folder page again.

        Returns:
            The cached list object itself (not a copy) when a cached listing
            is used; otherwise the freshly parsed list, which then becomes
            the cache.

        Raises:
            aiohttp.ClientResponseError: If the folder page responds with an
                HTTP error status.
        """
        if self._files_cache is not None and not force_refresh:
            return self._files_cache

        url = DRIVE_FOLDER_EMBED.format(folder_id=self.folder_id)
        async with aiohttp.ClientSession(
            headers={"User-Agent": USER_AGENT},
            connector=_drive_connector(),
        ) as session:
            async with session.get(url) as resp:
                resp.raise_for_status()
                html = await resp.text()

        files = self._parse_folder_listing(html)
        self._files_cache = files
        return files

    async def find_for_date(self, target: datetime) -> Optional[DriveScheduleFile]:
        """Return the schedule file whose date equals ``target``'s calendar day.

        Only year, month and day are compared; any time-of-day on ``target``
        is ignored. If several files share the date, the one latest in the
        sorted listing wins.

        Returns:
            The matching file, or ``None`` when there is none.
        """
        wanted = (target.year, target.month, target.day)
        files = await self.list_student_schedules()
        for item in reversed(files):
            found = item.schedule_date
            if (found.year, found.month, found.day) == wanted:
                return item
        return None

    async def download_pdf(self, file_id: str) -> bytes:
        """Download a file's content through the Drive direct-download URL.

        Args:
            file_id: Drive identifier of the file to download.

        Returns:
            The raw response body. NOTE(review): the body is returned as-is,
            with no check that it is actually PDF data.

        Raises:
            aiohttp.ClientResponseError: If the download responds with an
                HTTP error status.
        """
        url = DRIVE_DOWNLOAD_URL.format(file_id=file_id)
        async with aiohttp.ClientSession(
            headers={"User-Agent": USER_AGENT},
            connector=_drive_connector(),
        ) as session:
            async with session.get(url) as resp:
                resp.raise_for_status()
                return await resp.read()