# myfirstprogram/services/schedule_service.py

from __future__ import annotations

import io
import logging
import re
from datetime import datetime, timedelta
from html import escape
from typing import List, Optional, Tuple

import fitz
from pypdf import PdfReader

from config import Config
from services.drive_schedule_source import DriveScheduleSource

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Regex character class matching one character that is NOT an ASCII digit,
# an ASCII letter or a Russian Cyrillic letter (А-я plus Ё/ё). It serves as
# the "edge of a word" marker placed around a group name when the matching
# pattern is built.
BOUNDARY = r"[^0-9A-Za-zА-Яа-яЁё]"


class ScheduleService:
    def __init__(self):
        """Create the Drive source and an empty in-memory PDF cache.

        The Drive folder id is read from ``Config.SCHEDULE_DRIVE_FOLDER_ID``;
        when that attribute is missing or falsy, the built-in default folder
        id is used instead.
        """
        configured_id = getattr(Config, "SCHEDULE_DRIVE_FOLDER_ID", None)
        if not configured_id:
            configured_id = "1WhUFHGkS4qC_e84KRArF4ooXHJr8mL5T"
        self.drive = DriveScheduleSource(configured_id)
        # Downloaded PDF contents keyed by Drive file id.
        self._pdf_cache: dict[str, bytes] = {}

    def _resolve_target_date(self, day_offset: int = 0) -> datetime:
        """Choose the date whose schedule should be shown to a user.

        With ``day_offset == 0`` the date is picked automatically: today,
        or tomorrow once the local clock has reached 12:00; if the picked
        date is a Sunday it is pushed one more day forward.

        Any other value is handed to ``datetime.replace(day=...)``, so it is
        treated as a day of the current month rather than a relative offset.
        NOTE(review): the parameter name suggests an offset - confirm that
        callers really pass a day number. A number that is not valid for the
        current month makes ``replace`` raise ``ValueError``.

        Returns:
            The chosen date as a naive ``datetime`` at 00:00:00.
        """
        now = datetime.now()
        if day_offset != 0:
            chosen = now.replace(day=int(day_offset))
        else:
            chosen = now + timedelta(days=1) if now.hour >= 12 else now
            # weekday() == 6 is Sunday.
            if chosen.weekday() == 6:
                chosen += timedelta(days=1)
        return chosen.replace(hour=0, minute=0, second=0, microsecond=0)


    def _next_target_date(self, day_offset: int = 0) -> datetime:
        """Return midnight of the day ``day_offset`` days away from now.

        Unlike the automatic date selection used for user requests, this is
        a plain relative shift with no noon or Sunday adjustment.
        """
        shifted = datetime.now() + timedelta(days=day_offset)
        return shifted.replace(hour=0, minute=0, second=0, microsecond=0)


    async def _load_pdf_for_date(
        self, day_offset: int = 0
    ) -> Tuple[Optional[bytes], Optional[str], int, int]:
        """Fetch the schedule PDF for the date chosen by ``_resolve_target_date``.

        Each Drive file is downloaded at most once per service instance;
        later calls reuse the bytes kept in ``self._pdf_cache``.

        Returns:
            ``(pdf_bytes, view_url, day, month)``. ``pdf_bytes`` and
            ``view_url`` are both ``None`` when the Drive source finds no
            file for the date.
        """
        target_date = self._resolve_target_date(day_offset)
        day, month = target_date.day, target_date.month

        found = await self.drive.find_for_date(target_date)
        if not found:
            return None, None, day, month

        file_id = found.file_id
        cache = self._pdf_cache
        if file_id not in cache:
            cache[file_id] = await self.drive.download_pdf(file_id)

        view_url = f"https://drive.google.com/file/d/{file_id}/view"
        return cache[file_id], view_url, day, month

    async def _load_pdf_for_watcher(
        self, day_offset: int = 1
    ) -> Tuple[Optional[bytes], Optional[str], int, int]:
        """Fetch the schedule PDF for the day ``day_offset`` days from now.

        The date comes from ``_next_target_date`` (a plain relative shift,
        tomorrow by default). Downloaded bytes are kept in
        ``self._pdf_cache`` so each Drive file is fetched only once.

        Returns:
            ``(pdf_bytes, view_url, day, month)``. ``pdf_bytes`` and
            ``view_url`` are both ``None`` when the Drive source finds no
            file for the date.
        """
        target_date = self._next_target_date(day_offset)
        day, month = target_date.day, target_date.month

        found = await self.drive.find_for_date(target_date)
        if not found:
            return None, None, day, month

        file_id = found.file_id
        cache = self._pdf_cache
        if file_id not in cache:
            cache[file_id] = await self.drive.download_pdf(file_id)

        view_url = f"https://drive.google.com/file/d/{file_id}/view"
        return cache[file_id], view_url, day, month


    @staticmethod
    def exact_group_regex(group: str) -> re.Pattern:
        """Build a case-insensitive pattern that matches ``group`` as a whole token.

        The group name is escaped and must be delimited on each side either
        by the start/end of the string or by one ``BOUNDARY`` character, so
        a name embedded in a longer alphanumeric token does not match.
        """
        leading = f"(^|{BOUNDARY})"
        trailing = f"({BOUNDARY}|$)"
        return re.compile(leading + re.escape(group) + trailing, flags=re.IGNORECASE)

    @staticmethod
    def _extract_pdf_lines(pdf_bytes: bytes) -> List[str]:
        """Return every non-blank text line of the PDF, stripped, in page order.

        Pages for which text extraction yields nothing contribute no lines.
        """
        collected: List[str] = []
        for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
            page_text = page.extract_text() or ""
            stripped = (piece.strip() for piece in page_text.splitlines())
            collected.extend(piece for piece in stripped if piece)
        return collected

    @staticmethod
    def _parse_group_lines(lines: List[str], group: str) -> List[str]:
        """Cut out the run of lines that belongs to ``group``.

        The run starts at the first line matching the group pattern and
        continues until a later line contains a table separator (``-----``
        or ``+----``); the separator line itself is not included. The
        starting line is always kept, even if it contains a separator.

        Returns:
            The selected lines, or an empty list when no line matches.
        """
        matcher = ScheduleService.exact_group_regex(group)
        remaining = iter(lines)

        # Advance the shared iterator up to the group's first line.
        for candidate in remaining:
            if matcher.search(candidate):
                break
        else:
            return []

        selected: List[str] = [candidate]
        # The same iterator continues right after the matched line.
        for follower in remaining:
            if "-----" in follower or "+----" in follower:
                break
            selected.append(follower)
        return selected

    @staticmethod
    def is_schedule_missing(text: str) -> bool:
        lowered = text.lower()
        return "не найдено" in lowered or "не опубликовано" in lowered

    @staticmethod
    def _format_schedule_html(day: int, schedule_lines: List[str]) -> str:
        body_lines = []
        for line in schedule_lines:
            formatted = line.replace("¦", "│").replace("  ", " ").strip()
            if formatted:
                body_lines.append(formatted)
        body = escape("\n".join(body_lines))
        return f"📅 Расписание для {day} числа:\n<pre>{body}</pre>"

    async def is_published_for(self, day_offset: int = 0) -> bool:
        """Report whether the Drive source has a file for the shifted date.

        The date is now plus ``day_offset`` days, as computed by
        ``_next_target_date``.
        """
        wanted_date = self._next_target_date(day_offset)
        found = await self.drive.find_for_date(wanted_date)
        return found is not None

    async def get_schedule(
        self, group: str, day_offset: int = 0
    ) -> Tuple[str, str, int, int]:
        """Build the text schedule message for ``group``.

        Returns:
            ``(html_message, link, day, month)``. When no PDF is available
            the message is a "not published yet" warning and ``link`` is the
            Drive folder URL. Otherwise ``link`` is the PDF's view URL
            (falling back to the folder URL) and the message is either the
            formatted schedule or a "not found for this group" warning.
        """
        pdf_bytes, file_url, day, month = await self._load_pdf_for_date(day_offset)
        folder_url = "https://drive.google.com/drive/folders/" + self.drive.folder_id

        # Guard: nothing published for the target date.
        if not pdf_bytes:
            notice = (
                f"⚠️ Расписание на {day:02d}.{month:02d} ещё не опубликовано "
                f"в <a href=\"{folder_url}\">Google Drive</a>"
            )
            return notice, folder_url, day, month

        all_lines = self._extract_pdf_lines(pdf_bytes)
        group_lines = self._parse_group_lines(all_lines, group)
        link = file_url or folder_url

        if group_lines:
            return self._format_schedule_html(day, group_lines), link, day, month

        # The group name is user-supplied, so it is escaped before being
        # placed into the HTML message.
        missing = f"⚠️ Расписание для группы {escape(group)} на {day} число не найдено"
        return missing, link, day, month

    async def get_pschedule(
        self, group: str, day_offset: int = 0
    ) -> Tuple[Optional[bytes], str, int, int]:
        """Render the part of the schedule PDF that belongs to ``group`` as PNG.

        Pages are scanned in order. On each page the text lines are walked
        from the first line matching the group pattern until a line that
        contains a table separator (``-----`` or ``+----``); the bounding
        boxes of those lines are merged, padded and rendered at 2x scale.
        The first page that yields a region wins.

        Args:
            group: Group name to look for.
            day_offset: Forwarded to ``_load_pdf_for_date``.

        Returns:
            ``(png_bytes, link, day, month)``. ``png_bytes`` is ``None`` when
            no PDF is available, the group is not found on any page, or
            rendering fails (the failure is logged, not raised). ``link`` is
            the PDF's view URL, or the Drive folder URL when there is none.
        """
        pdf_bytes, url, day, month = await self._load_pdf_for_date(day_offset)
        fallback_url = (
            url
            or "https://drive.google.com/drive/folders/" + self.drive.folder_id
        )

        if not pdf_bytes:
            return None, fallback_url, day, month

        try:
            regex = self.exact_group_regex(group)

            # Open the document as a context manager so it is closed on
            # every exit path (early return, no match, or an exception).
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                for page in doc:
                    # Collect (text, bbox) for every non-empty text line.
                    line_items = []
                    page_dict = page.get_text("dict")
                    for block in page_dict.get("blocks", []):
                        # Only type-0 blocks are processed; they are the
                        # ones read for "lines"/"spans" below.
                        if block.get("type") != 0:
                            continue
                        for line in block.get("lines", []):
                            text = "".join(
                                span["text"] for span in line["spans"]
                            ).strip()
                            if text:
                                line_items.append((text, fitz.Rect(line["bbox"])))

                    # Gather the boxes from the group's first line up to
                    # (not including) the next separator line.
                    found_group = False
                    rects: List[fitz.Rect] = []
                    for text, bbox in line_items:
                        if not found_group:
                            if regex.search(text):
                                found_group = True
                                rects.append(bbox)
                        elif "-----" in text or "+----" in text:
                            break
                        else:
                            rects.append(bbox)

                    if not rects:
                        continue

                    # Union of all boxes; start from a copy so the in-place
                    # ``|=`` does not modify the first collected rectangle.
                    clip = fitz.Rect(rects[0])
                    for rect in rects[1:]:
                        clip |= rect

                    # Pad the region (extra room on the right), clamped to
                    # the page size.
                    clip.x0 = max(0, clip.x0 - 10)
                    clip.x1 = min(page.rect.width, clip.x1 + 150)
                    clip.y0 = max(0, clip.y0 - 5)
                    clip.y1 = min(page.rect.height, clip.y1 + 10)

                    pixmap = page.get_pixmap(clip=clip, matrix=fitz.Matrix(2, 2))
                    return pixmap.tobytes("png"), fallback_url, day, month

        except Exception as e:
            # Best-effort: any failure degrades to "no image"; the traceback
            # is kept in the log for diagnosis.
            logger.exception("Ошибка при получении расписания из PDF: %s", e)

        return None, fallback_url, day, month