import hashlib
import json
import os
import re
from dataclasses import dataclass, asdict
from functools import lru_cache

import requests
import six
from PIL import Image
from bs4 import BeautifulSoup, NavigableString
from markdownify import MarkdownConverter, re_whitespace


@lru_cache
def _hash_html(html: str):
    """Return the hexadecimal MD5 digest of ``html`` encoded as UTF-8.

    Results are memoized by ``lru_cache`` so hashing the same document
    repeatedly does not recompute the digest.
    """
    digest = hashlib.md5()
    digest.update(html.encode("utf-8"))
    return digest.hexdigest()


def get_image_name(html: str, div_idx: int):
    """Build the file name used for the image found in div number ``div_idx``.

    The name combines the MD5 hash of the whole ``html`` document with the
    div index, followed by the ``_img.webp`` suffix.
    """
    return f"{_hash_html(html)}_{div_idx}_img.webp"


def fix_raw(html: str):
    """Rewrite every run of four ``<BBOXn>`` tokens as a ``[a,b,c,d]`` list.

    Runs shorter than four tokens are left untouched; in a longer run only
    complete groups of four are replaced.
    """
    bbox_run = re.compile(r"(?:<BBOX\d+>){4}")

    def _to_list_literal(run):
        # Keep only the digits of each token, in their original order.
        coords = re.findall(r"\d+", run.group(0))
        return "[" + ",".join(coords) + "]"

    return bbox_run.sub(_to_list_literal, html)


def extract_images(html: str, chunks: list[dict], image: Image.Image):
    """Crop the page region of every image/figure chunk.

    Args:
        html: The document the chunks came from; only used (via
            ``get_image_name``) to derive the image file names.
        chunks: Sequence of chunk mappings with ``"label"``, ``"content"``
            and ``"bbox"`` keys.
        image: Page image the bounding boxes refer to.

    Returns:
        A dict mapping image file name to the cropped image, for chunks
        labelled ``"Image"`` or ``"Figure"``.
    """
    images = {}
    # Chunks are numbered from 1, the same numbering parse_html uses when it
    # calls get_image_name for the corresponding div.
    for div_idx, chunk in enumerate(chunks, start=1):
        if chunk["label"] not in ("Image", "Figure"):
            continue

        # NOTE(review): when ``content`` is a str (as produced by
        # parse_chunks in this file) this is ``str.find``, which returns -1
        # (truthy) when "img" is absent and 0 (falsy) only when the content
        # starts with "img". The guard therefore only skips reliably for
        # Tag-like content. Behavior is kept as-is; confirm the intent.
        if not chunk["content"].find("img"):
            continue

        try:
            block_image = image.crop(chunk["bbox"])
        except ValueError:
            # Happens when bbox coordinates are invalid
            continue
        images[get_image_name(html, div_idx)] = block_image
    return images


def parse_html(
    html: str, include_headers_footers: bool = False, include_images: bool = True
):
    """Flatten the top-level layout divs of ``html`` into one HTML string.

    Args:
        html: Document whose top-level ``<div>`` elements carry a
            ``data-label`` attribute.
        include_headers_footers: When False, divs labelled ``Page-Header``
            or ``Page-Footer`` are dropped.
        include_images: When False, divs labelled ``Image`` or ``Figure``
            are dropped.

    Returns:
        The concatenated inner HTML of the remaining divs. Image/figure divs
        get an ``<img>`` whose ``src`` is the name from ``get_image_name``;
        ``Text`` divs without any inner tag are wrapped in ``<p>``.
    """
    header_footer_labels = ("Page-Header", "Page-Footer")
    image_labels = ("Image", "Figure")

    soup = BeautifulSoup(html, "html.parser")
    parts = []
    # Every top-level div counts towards the index, including skipped ones,
    # so image names stay tied to the div's position in the document.
    for div_idx, div in enumerate(soup.find_all("div", recursive=False), start=1):
        label = div.get("data-label")

        # Skip headers and footers if not included
        if not include_headers_footers and label in header_footer_labels:
            continue
        if not include_images and label in image_labels:
            continue

        if label in image_labels:
            img_src = get_image_name(html, div_idx)
            img = div.find("img")
            if img:
                img["src"] = img_src
            else:
                # If no tag, add one in
                div.append(BeautifulSoup(f"<img src='{img_src}'/>", "html.parser"))

        if label == "Text":
            inner_html = str(div.decode_contents()).strip()
            # Wrap text content in <p> tags if no inner HTML tags exist
            if not re.search("<.+>", inner_html):
                div.clear()
                div.append(BeautifulSoup(f"<p>{inner_html}</p>", "html.parser"))

        parts.append(str(div.decode_contents()))
    return "".join(parts)


def escape_dollars(text):
    """Return ``text`` with every ``$`` prefixed by a backslash."""
    escaped = text.replace("$", r"\$")
    return escaped


def get_formatted_table_text(element):
    """Render the direct children of ``element`` as one line of cell text.

    Text nodes are stripped and dollar-escaped, ``<br>`` children become the
    literal ``<br>``, ``<math>`` children become ``$...$`` around their text,
    and any other child is kept as its dollar-escaped string form. Fragments
    are joined with single spaces, except that no space is put on either
    side of a ``<br>``.
    """
    fragments = []
    for node in element.contents:
        if node is None:
            continue

        if isinstance(node, NavigableString):
            cleaned = node.strip()
            if cleaned:
                fragments.append(escape_dollars(cleaned))
            continue

        tag_name = node.name
        if tag_name == "br":
            fragments.append("<br>")
        elif tag_name == "math":
            fragments.append("$" + node.text + "$")
        else:
            fragments.append(escape_dollars(str(node)))

    pieces = []
    previous = None
    for fragment in fragments:
        # A separating space is only needed between two non-<br> fragments.
        if fragment != "<br>" and previous is not None and previous != "<br>":
            pieces.append(" ")
        pieces.append(fragment)
        previous = fragment
    return "".join(pieces)


class Markdownify(MarkdownConverter):
    """``MarkdownConverter`` with math delimiters, raw tables and ``$`` escaping.

    Args:
        inline_math_delimiters: Pair ``(open, close)`` placed around inline
            ``<math>`` content.
        block_math_delimiters: Pair ``(open, close)`` placed around
            ``<math display="block">`` content.
        **kwargs: Options forwarded to ``MarkdownConverter``. The extra
            ``escape_dollars`` option turns on backslash-escaping of ``$``
            in text; it is treated as off when not supplied.
    """

    def __init__(
        self,
        inline_math_delimiters,
        block_math_delimiters,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.inline_math_delimiters = inline_math_delimiters
        self.block_math_delimiters = block_math_delimiters

    def convert_math(self, el, text, parent_tags):
        """Wrap math text in block or inline delimiters depending on ``display``."""
        block = el.has_attr("display") and el["display"] == "block"
        if block:
            return (
                "\n"
                + self.block_math_delimiters[0]
                + text.strip()
                + self.block_math_delimiters[1]
                + "\n"
            )
        return (
            " "
            + self.inline_math_delimiters[0]
            + text.strip()
            + self.inline_math_delimiters[1]
            + " "
        )

    def convert_table(self, el, text, parent_tags):
        """Keep tables as raw HTML, separated from neighbours by blank lines."""
        return "\n\n" + str(el) + "\n\n"

    def convert_a(self, el, text, parent_tags):
        """Convert a link after escaping brackets and parentheses in its text."""
        # NOTE(review): ``text`` appears to have gone through process_text
        # already, so this second escape may double-escape characters;
        # kept as-is, confirm against the markdownify version in use.
        text = self.escape(text)
        # Escape brackets and parentheses in text
        text = re.sub(r"([\[\]()])", r"\\\1", text)
        return super().convert_a(el, text, parent_tags)

    def escape(self, text, parent_tags=None):
        """Apply the base escaping, then escape ``$`` when the option is set."""
        text = super().escape(text, parent_tags)
        # .get() so a converter built without the custom option does not
        # raise KeyError on the first text node.
        if self.options.get("escape_dollars", False):
            text = text.replace("$", r"\$")
        return text

    def process_text(self, el, parent_tags=None):
        """Convert a text node: normalize whitespace and escape where allowed."""
        text = six.text_type(el) or ""

        # normalize whitespace if we're not inside a preformatted element
        if not el.find_parent("pre"):
            text = re_whitespace.sub(" ", text)

        # escape special characters if we're not inside a preformatted,
        # code-like or math element
        if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if el.parent.name == "li" and (
            not el.next_sibling or el.next_sibling.name in ["ul", "ol"]
        ):
            text = text.rstrip()

        return text


def parse_markdown(
    html: str, include_headers_footers: bool = False, include_images: bool = True
):
    """Convert layout HTML to Markdown.

    The HTML is first flattened by ``parse_html`` (honouring the two
    ``include_*`` flags) and then run through ``Markdownify``. If the
    conversion raises, the error is printed and an empty string is returned.
    """
    flattened_html = parse_html(html, include_headers_footers, include_images)

    converter_options = {
        "heading_style": "ATX",
        "bullets": "-",
        "escape_misc": False,
        "escape_underscores": True,
        "escape_asterisks": True,
        "escape_dollars": True,
        "sub_symbol": "<sub>",
        "sup_symbol": "<sup>",
        "inline_math_delimiters": ("$", "$"),
        "block_math_delimiters": ("$$", "$$"),
    }
    converter = Markdownify(**converter_options)

    try:
        markdown = converter.convert(flattened_html)
    except Exception as exc:
        print(f"Error converting HTML to Markdown: {exc}")
        return ""
    return markdown.strip()


@dataclass
class LayoutBlock:
    """One top-level layout ``div`` of a page, as built by ``parse_layout``."""

    # Four integer box coordinates; parse_layout stores them scaled to the
    # page image's pixel size and clamped to its bounds.
    bbox: list[int]
    # Value of the div's ``data-label`` attribute ("block" when parse_layout
    # finds none).
    label: str
    # Inner HTML of the div, as a string.
    content: str


def _parse_bbox_attr(raw):
    """Decode a ``data-bbox`` attribute into a list of ints, or return None."""
    try:
        values = [int(value) for value in json.loads(raw)]
    except (TypeError, ValueError, OverflowError, RecursionError):
        # Missing attribute, invalid JSON, non-iterable JSON (e.g. null or a
        # number) or non-numeric entries.
        return None
    # Four coordinates are indexed below; shorter lists are unusable.
    return values if len(values) >= 4 else None


def parse_layout(html: str, image: Image.Image):
    """Extract one ``LayoutBlock`` per top-level ``<div>`` of ``html``.

    Each div's ``data-bbox`` attribute is read as a JSON list of four
    numbers. The code treats those numbers as lying on a 1024-unit grid and
    scales them to the pixel size of ``image``, clamping to the image
    bounds. A missing or malformed ``data-bbox`` falls back to
    ``[0, 0, 1, 1]`` (before scaling) instead of raising.

    Args:
        html: Document whose top-level divs describe layout blocks.
        image: Page image; only its ``size`` is used.

    Returns:
        A list of ``LayoutBlock`` in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    width, height = image.size
    width_scaler = width / 1024
    height_scaler = height / 1024

    layout_blocks = []
    for div in soup.find_all("div", recursive=False):
        bbox = _parse_bbox_attr(div.get("data-bbox"))
        if bbox is None:
            bbox = [0, 0, 1, 1]

        # Normalize bbox
        bbox = [
            max(0, int(bbox[0] * width_scaler)),
            max(0, int(bbox[1] * height_scaler)),
            min(int(bbox[2] * width_scaler), width),
            min(int(bbox[3] * height_scaler), height),
        ]
        label = div.get("data-label", "block")
        content = str(div.decode_contents())
        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
    return layout_blocks


def parse_chunks(html: str, image: Image.Image):
    """Return the layout of ``html`` as a list of plain dicts.

    Each dict is the ``asdict`` form of a ``LayoutBlock`` from
    ``parse_layout``.
    """
    return [asdict(layout_block) for layout_block in parse_layout(html, image)]


def translate_to_korean(english_text: str) -> str:
    """Translate English text to Korean using the Ollama API.

    Posts a non-streaming request to ``/api/generate`` on the server named
    by the ``OLLAMA_PROXY_SERVER`` environment variable (default
    ``http://localhost:11434``) with a 30 second timeout.

    Args:
        english_text: Text to translate.

    Returns:
        The translated text. The original ``english_text`` is returned
        unchanged when the request fails, the server answers with a
        non-200 status, or the model returns an empty response.
    """
    try:
        # Drop trailing slashes so the joined URL has no "//api/generate".
        ollama_url = os.environ.get(
            'OLLAMA_PROXY_SERVER', 'http://localhost:11434'
        ).rstrip('/')
        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": "hamonize:latest",
                "prompt": f"다음 영어 이미지 설명을 자연스러운 한국어로 번역해주세요. 번역 결과만 출력하고 다른 설명은 하지 마세요.\n\n영어: {english_text}\n\n한국어:",
                "stream": False,
            },
            timeout=30,
        )

        if response.status_code != 200:
            print(f"⚠️ Ollama API 오류: {response.status_code}")
            return english_text

        korean_text = response.json().get('response', '').strip()
        if not korean_text:
            # An empty answer would erase the caller's text; keep the original.
            print("⚠️ 번역 결과가 비어 있어 원문을 유지합니다.")
            return english_text

        print(f"✅ 번역 완료: \"{english_text[:50]}...\" → \"{korean_text[:50]}...\"")
        return korean_text
    except Exception as e:
        # Best-effort boundary: any network or decoding failure keeps the original.
        print(f"⚠️ 번역 중 오류 발생: {e}")
        return english_text


def translate_image_alts_to_korean(content: str, translator=None) -> str:
    """Translate English image alt text in HTML and Markdown to Korean.

    Handles HTML ``<img ... alt="...">`` tags and Markdown ``![alt](src)``
    images. Alt text that already contains Hangul is left untouched. Only
    the alt text itself is rewritten; the rest of each match (attributes,
    URL) is preserved.

    Args:
        content: HTML and/or Markdown text.
        translator: Optional callable ``str -> str`` used to translate one
            alt text. Defaults to ``translate_to_korean``.

    Returns:
        ``content`` with translatable alt texts replaced. Double quotes in
        a translation are written as ``&quot;`` inside HTML attributes, and
        square brackets are backslash-escaped inside Markdown alt text, so
        the translated text cannot break the surrounding markup.
    """
    if translator is None:
        translator = translate_to_korean

    # alt attribute of an HTML img tag
    html_img_regex = re.compile(r'<img[^>]+alt="([^"]+)"[^>]*>')
    # Markdown image syntax ![alt](src)
    md_img_regex = re.compile(r'!\[([^\]]+)\]\([^)]+\)')
    # Hangul jamo and syllables. No "|" inside the class: there it would be
    # a literal pipe and alt texts containing "|" would be wrongly skipped.
    hangul_regex = re.compile(r'[ㄱ-ㅎㅏ-ㅣ가-힣]')

    total_matches = len(html_img_regex.findall(content)) + len(
        md_img_regex.findall(content)
    )

    if total_matches == 0:
        print("⚠️ 번역할 이미지 alt 속성이 없습니다.")
        return content

    print(f"🔄 {total_matches}개의 이미지 설명을 한국어로 번역 중...")

    # Translate each distinct alt text once, even when an image repeats.
    translations = {}

    def _lookup(english_alt):
        """Return the translation for an alt text, or None to leave it as is."""
        # Skip if the text already contains Hangul
        if hangul_regex.search(english_alt):
            print(f"⏭️ 건너뜀 (이미 한글 포함): \"{english_alt[:50]}...\"")
            return None
        if english_alt not in translations:
            # An empty translation would erase the alt text; keep the original.
            translations[english_alt] = translator(english_alt) or english_alt
        return translations[english_alt]

    def _splice(match, escape):
        """Rebuild the matched image with only its alt span replaced."""
        korean_alt = _lookup(match.group(1))
        whole = match.group(0)
        if korean_alt is None:
            return whole
        start = match.start(1) - match.start(0)
        end = match.end(1) - match.start(0)
        return whole[:start] + escape(korean_alt) + whole[end:]

    def _escape_html_alt(text):
        return text.replace('"', "&quot;")

    def _escape_md_alt(text):
        return text.replace("[", r"\[").replace("]", r"\]")

    # HTML images first, then Markdown images.
    translated_content = html_img_regex.sub(
        lambda match: _splice(match, _escape_html_alt), content
    )
    translated_content = md_img_regex.sub(
        lambda match: _splice(match, _escape_md_alt), translated_content
    )

    print("✅ 모든 이미지 설명 번역 완료")
    return translated_content
