#!/usr/bin/env python3
"""
HTML to Document Converter Service for ESG Reports
Converts HTML ESG analysis reports to PDF, DOCX, and HWP formats
"""

import os
import sys
import tempfile
from pathlib import Path
from bs4 import BeautifulSoup
import re
import base64
from io import BytesIO
from PIL import Image

# Add parent directory to path to import utils
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))
from utils import PDFDocument, DOCXDocument, HWPDocument


class ESGHTMLConverter:
    """Convert HTML ESG reports to PDF, DOCX, or HWP documents.

    The HTML is parsed with BeautifulSoup and replayed, element by element,
    onto one of the document builders imported from ``utils``. Only ``h1``
    (title), ``h2``, ``h3``, ``p``, ``ul``, ``table`` and ``img`` elements are
    rendered; ``canvas`` elements are replaced by a text placeholder first.
    """

    # Base64 images wider than this many pixels are downscaled before saving.
    MAX_IMAGE_WIDTH_PX = 2400

    # PDF image sizing: 0.5 pt per pixel (72 pt/inch at 144 px/inch), capped
    # at 480 pt so the image stays inside the page margins.
    PDF_POINTS_PER_PIXEL = 0.5
    PDF_MAX_WIDTH_PT = 480.0

    # DOCX image sizing: 80 px per cm (about 203 DPI), capped at 15 cm, which
    # fits the ~16 cm usable width of an A4 page with margins.
    DOCX_PIXELS_PER_CM = 80.0
    DOCX_MAX_WIDTH_CM = 15.0

    def __init__(self):
        """Initialize the converter with the list of supported formats."""
        self.supported_formats = ['pdf', 'docx', 'hwp']

    def convert(self, html_content: str, output_format: str, output_path: str = None) -> str:
        """
        Convert HTML content to the specified format.

        Args:
            html_content: HTML content string
            output_format: Target format (pdf, docx, hwp), case-insensitive
            output_path: Optional output file path. If None, the document is
                written to ``esg_report.<format>`` in the system temp directory.

        Returns:
            str: Path to the generated document

        Raises:
            ValueError: If ``output_format`` is not a supported format.
        """
        # Normalise once so the check, the dispatch and the default file
        # extension all agree (previously "PDF" produced "esg_report.PDF").
        fmt = output_format.lower()
        handlers = {
            'pdf': self._convert_to_pdf,
            'docx': self._convert_to_docx,
            'hwp': self._convert_to_hwp,
        }
        handler = handlers.get(fmt)
        if fmt not in self.supported_formats or handler is None:
            raise ValueError(f"Unsupported format: {output_format}. Supported: {self.supported_formats}")

        # Parse HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Generate output path if not provided
        if output_path is None:
            output_path = os.path.join(tempfile.gettempdir(), f"esg_report.{fmt}")

        return handler(soup, output_path)

    def _extract_text_content(self, element):
        """Return the element's text with all whitespace runs collapsed to one space.

        Returns an empty string when ``element`` is falsy (e.g. ``None``).
        """
        if not element:
            return ""

        # Get text and clean up whitespace
        text = element.get_text(separator=' ', strip=True)
        # Normalize whitespace
        return re.sub(r'\s+', ' ', text)

    @staticmethod
    def _fit_width(width, height, max_width):
        """Scale ``(width, height)`` down proportionally so that width <= max_width.

        Sizes that already fit are returned unchanged.
        """
        if width > max_width:
            scale = max_width / width
            return max_width, height * scale
        return width, height

    @staticmethod
    def _save_temp_png(image, **save_options):
        """Save ``image`` to a new temporary PNG file and return its path.

        The descriptor is closed before Pillow writes to the path by name, and
        the file is removed again if saving fails so that no orphan is left.
        """
        fd, path = tempfile.mkstemp(suffix='.png')
        os.close(fd)
        try:
            image.save(path, 'PNG', **save_options)
        except Exception:
            try:
                os.unlink(path)
            except OSError:
                pass
            raise
        return path

    def _decode_base64_image(self, src):
        """Decode a ``data:image...;base64,...`` URI into a PIL image.

        Images wider than ``MAX_IMAGE_WIDTH_PX`` are downscaled (aspect ratio
        kept) to limit memory use; everything else keeps its resolution.
        Raises whatever the split, base64 decoding or Pillow raise on bad input.
        """
        _, payload = src.split(',', 1)
        image = Image.open(BytesIO(base64.b64decode(payload)))
        original_width, original_height = image.size
        print(f"[DEBUG] Decoded base64 image: {image.format} {original_width}x{original_height}")

        # Keep high resolution - the frontend already generates high-res
        # images, so only extremely large ones are resized.
        max_width = self.MAX_IMAGE_WIDTH_PX
        if original_width > max_width:
            ratio = max_width / original_width
            new_height = max(1, int(original_height * ratio))
            image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
            print(f"[DEBUG] Resized extremely large image to: {max_width}x{new_height}")
        else:
            print(f"[DEBUG] Keeping original high-resolution: {original_width}x{original_height}")
        return image

    @staticmethod
    def _download_image(src):
        """Download ``src`` over HTTP(S) and return it as a PIL image.

        SECURITY NOTE(review): the URL comes straight from the HTML being
        converted. If that HTML is untrusted, this is a server-side request
        to an attacker-chosen address - consider an allow-list.
        """
        # Imported lazily so a missing ``requests`` only disables URL images.
        import requests

        response = requests.get(src, timeout=10)
        # Fail on 4xx/5xx instead of handing an error page to Pillow.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))

    def _extract_images(self, soup):
        """Extract all images from the HTML into temporary PNG files.

        Handles base64 ``data:image`` URIs and ``http(s)`` URLs; any other
        ``src`` is ignored. An image that fails to decode, download or save is
        logged and skipped.

        Returns:
            list of dicts with keys ``path`` (temp PNG), ``element`` (the img
            tag), ``alt``, ``temp`` (always True), ``index`` (position among
            all img tags), ``width`` and ``height`` (pixels of the saved file).
        """
        images = []

        # Find all img tags
        img_tags = soup.find_all('img')
        print(f"[DEBUG] Found {len(img_tags)} img tags in HTML")

        for idx, img in enumerate(img_tags):
            src = img.get('src', '')
            print(f"[DEBUG] Processing img {idx}: src length = {len(src) if src else 0}")

            if src.startswith('data:image'):
                # Base64 encoded image
                try:
                    image = self._decode_base64_image(src)
                    # Fast, unoptimised PNG compression (lossless either way)
                    path = self._save_temp_png(image, optimize=False, compress_level=1)
                    print(f"[DEBUG] Saved optimized image to: {path}")
                except Exception as e:
                    print(f"[ERROR] Error processing base64 image {idx}: {e}")
                    continue
            elif src.startswith('http'):
                # URL image - download it
                try:
                    image = self._download_image(src)
                    path = self._save_temp_png(image)
                    print(f"[DEBUG] Downloaded and saved image from URL to: {path}")
                except Exception as e:
                    print(f"[ERROR] Error downloading image {idx} from {src}: {e}")
                    continue
            else:
                continue

            images.append({
                'path': path,
                'element': img,
                'alt': img.get('alt', ''),
                'temp': True,
                'index': idx,
                'width': image.width,
                'height': image.height
            })

        print(f"[DEBUG] Successfully extracted {len(images)} images")
        return images

    def _process_canvas_elements(self, soup):
        """
        Replace canvas elements with placeholder text.

        Note: Canvas elements (Chart.js charts) cannot be directly converted.
        They need to be rendered to images on the client side before conversion.
        The placeholder is a ``p`` tag, so it is rendered as a normal paragraph.
        """
        for canvas in soup.find_all('canvas'):
            # Use the canvas id as the chart label, falling back to 'chart'
            chart_id = canvas.get('id', 'chart')

            # Create placeholder paragraph
            placeholder = soup.new_tag('p')
            placeholder.string = f"[차트: {chart_id}]"
            placeholder['style'] = 'color: #666; font-style: italic; text-align: center; padding: 20px; border: 1px dashed #ccc;'

            # Replace canvas with placeholder
            canvas.replace_with(placeholder)

    @staticmethod
    def _index_images(images):
        """Map ``id(img element)`` to its extracted-image record.

        Keyed by object identity so lookups do not depend on how bs4 compares
        or hashes tags (and never stringify large base64 ``src`` attributes).
        """
        return {id(info['element']): info for info in images}

    @staticmethod
    def _cleanup_temp_images(images):
        """Delete the temporary files behind ``images``; failures are logged, not raised."""
        for info in images:
            if not info['temp']:
                continue
            try:
                os.unlink(info['path'])
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"[ERROR] Could not remove temporary image {info['path']}: {e}")

    def _add_front_matter(self, doc, soup, title_level):
        """Add the ``h1`` title and the ``div.document-meta`` paragraphs to ``doc``.

        Returns the meta section element (or None) so the body pass can avoid
        rendering its paragraphs a second time.
        """
        # Extract title
        title_elem = soup.find('h1')
        if title_elem:
            doc.add_heading(self._extract_text_content(title_elem), level=title_level)

        # Extract meta information
        meta_section = soup.find('div', class_='document-meta')
        if meta_section:
            for p in meta_section.find_all('p'):
                doc.add_paragraph(self._extract_text_content(p))
            doc.add_paragraph("")  # Add spacing
        return meta_section

    @staticmethod
    def _is_emitted_by_container(elem, meta_section):
        """Return True if ``elem`` is already rendered by an enclosing container.

        ``find_all`` returns nested matches as well as their containers, which
        used to duplicate content. The following are therefore skipped:

        * ``p`` inside the meta section, inside an ``li`` of a ``ul``, or
          inside a ``td``/``th`` of a table row (the container emits its text);
        * ``ul`` nested in an ``li`` of another ``ul`` (the outer list already
          bullets every descendant ``li``);
        * ``table`` nested in a cell of another table (the outer table already
          collects every descendant ``tr``).

        Headings and images are never skipped.
        """
        name = elem.name
        if name not in ('p', 'ul', 'table'):
            return False

        ancestors = list(elem.parents)
        for depth, ancestor in enumerate(ancestors):
            outer_names = {outer.name for outer in ancestors[depth + 1:]}
            in_list_item = ancestor.name == 'li' and 'ul' in outer_names
            in_table_cell = ancestor.name in ('td', 'th') and {'tr', 'table'} <= outer_names

            if name == 'p' and (ancestor is meta_section or in_list_item or in_table_cell):
                return True
            if name == 'ul' and in_list_item:
                return True
            if name == 'table' and in_table_cell:
                return True
        return False

    def _body_elements(self, soup, meta_section):
        """Return the renderable body elements in document order, without duplicates."""
        candidates = soup.find_all(['h2', 'h3', 'p', 'ul', 'table', 'img'])
        return [
            elem for elem in candidates
            if not self._is_emitted_by_container(elem, meta_section)
        ]

    def _table_rows(self, table):
        """Return the table as a list of rows of cell texts, dropping rows without cells."""
        rows = []
        for tr in table.find_all('tr'):
            cells = [self._extract_text_content(cell) for cell in tr.find_all(['th', 'td'])]
            if cells:
                rows.append(cells)
        return rows

    def _render_body(self, doc, elements, heading_levels, add_image):
        """Replay ``elements`` onto ``doc``.

        Args:
            doc: Document builder exposing ``add_heading``, ``add_paragraph``
                and ``add_table``.
            elements: Body elements in document order.
            heading_levels: Maps tag name ('h2', 'h3') to the builder's level.
            add_image: Callable invoked with each ``img`` element; it owns the
                format-specific sizing and error handling.
        """
        for elem in elements:
            name = elem.name
            if name in heading_levels:
                doc.add_heading(self._extract_text_content(elem), level=heading_levels[name])
            elif name == 'p':
                text = self._extract_text_content(elem)
                if text:  # Only add non-empty paragraphs
                    doc.add_paragraph(text)
            elif name == 'img':
                add_image(elem)
            elif name == 'ul':
                for li in elem.find_all('li'):
                    doc.add_paragraph(f"• {self._extract_text_content(li)}")
            elif name == 'table':
                rows = self._table_rows(elem)
                if rows:
                    doc.add_table(rows)

    def _convert_to_pdf(self, soup: "BeautifulSoup", output_path: str) -> str:
        """Convert the parsed HTML to PDF using PDFDocument and return ``output_path``.

        Heading levels: h1 -> 1, h2 -> 2, h3 -> 3. Images are sized at
        ``PDF_POINTS_PER_PIXEL`` and capped at ``PDF_MAX_WIDTH_PT`` with the
        aspect ratio preserved. Temporary image files are always removed.
        """
        pdf = PDFDocument()

        # Process canvas elements (Chart.js charts)
        self._process_canvas_elements(soup)

        # Extract images
        images = self._extract_images(soup)

        try:
            image_lookup = self._index_images(images)
            meta_section = self._add_front_matter(pdf, soup, title_level=1)

            print("[DEBUG] Starting to process document sections for PDF")
            elements = self._body_elements(soup, meta_section)
            print(f"[DEBUG] Found {len(elements)} total elements")

            img_count = 0

            def add_image(elem):
                nonlocal img_count
                img_count += 1
                print(f"[DEBUG] Found img element #{img_count}, checking if in image_map...")
                info = image_lookup.get(id(elem))
                if info is None:
                    print("[DEBUG] Image element not found in image_map")
                    return

                image_path = info['path']
                print(f"[DEBUG] Adding image to PDF: {image_path}")
                try:
                    if not os.path.exists(image_path):
                        print(f"[ERROR] Image file not found: {image_path}")
                        return

                    # Dimensions were measured when the file was written, so
                    # the image does not need to be reopened here.
                    img_width, img_height = info['width'], info['height']
                    calculated_width, calculated_height = self._fit_width(
                        img_width * self.PDF_POINTS_PER_PIXEL,
                        img_height * self.PDF_POINTS_PER_PIXEL,
                        self.PDF_MAX_WIDTH_PT,
                    )
                    pdf.add_image(image_path, width=calculated_width, height=calculated_height)
                    print(f"[DEBUG] Successfully added image to PDF ({img_width}x{img_height}px -> {calculated_width:.1f}x{calculated_height:.1f}pt, aspect ratio preserved)")
                except Exception as e:
                    # Best effort: one bad image must not abort the document
                    print(f"[ERROR] Error adding image to PDF: {e}")
                    import traceback
                    traceback.print_exc()

            self._render_body(pdf, elements, {'h2': 2, 'h3': 3}, add_image)
            print(f"[DEBUG] Processed {img_count} img elements total")

            # Save PDF
            pdf.save(output_path)

        finally:
            # Clean up temporary image files
            self._cleanup_temp_images(images)

        return output_path

    def _convert_to_docx(self, soup: "BeautifulSoup", output_path: str) -> str:
        """Convert the parsed HTML to DOCX using DOCXDocument and return ``output_path``.

        Heading levels: h1 -> 0, h2 -> 1, h3 -> 2. Images are sized at
        ``DOCX_PIXELS_PER_CM`` and capped at ``DOCX_MAX_WIDTH_CM`` with the
        aspect ratio preserved. Temporary image files are always removed.
        """
        docx = DOCXDocument()

        # Process canvas elements (Chart.js charts)
        self._process_canvas_elements(soup)

        # Extract images
        images = self._extract_images(soup)

        try:
            image_lookup = self._index_images(images)
            meta_section = self._add_front_matter(docx, soup, title_level=0)
            elements = self._body_elements(soup, meta_section)

            def add_image(elem):
                info = image_lookup.get(id(elem))
                if info is None:
                    return
                try:
                    image_path = info['path']
                    img_width, img_height = info['width'], info['height']
                    calculated_width, calculated_height = self._fit_width(
                        img_width / self.DOCX_PIXELS_PER_CM,
                        img_height / self.DOCX_PIXELS_PER_CM,
                        self.DOCX_MAX_WIDTH_CM,
                    )
                    docx.add_image(image_path, width=calculated_width, height=calculated_height)
                    print(f"[DEBUG] Added image to DOCX ({img_width}x{img_height}px -> {calculated_width:.2f}x{calculated_height:.2f}cm, aspect ratio preserved)")
                except Exception as e:
                    # Best effort: one bad image must not abort the document
                    print(f"Error adding image to DOCX: {e}")

            self._render_body(docx, elements, {'h2': 1, 'h3': 2}, add_image)

            # Save DOCX
            docx.save(output_path)

        finally:
            # Clean up temporary image files
            self._cleanup_temp_images(images)

        return output_path

    def _convert_to_hwp(self, soup: "BeautifulSoup", output_path: str) -> str:
        """Convert the parsed HTML to HWP using HWPDocument and return ``output_path``.

        Heading levels: h1 -> 1, h2 -> 2, h3 -> 3. Images are added without an
        explicit size. Temporary image files are always removed.
        """
        hwp = HWPDocument()

        # Process canvas elements (Chart.js charts)
        self._process_canvas_elements(soup)

        # Extract images
        images = self._extract_images(soup)

        try:
            image_lookup = self._index_images(images)
            meta_section = self._add_front_matter(hwp, soup, title_level=1)
            elements = self._body_elements(soup, meta_section)

            def add_image(elem):
                info = image_lookup.get(id(elem))
                if info is None:
                    return
                try:
                    image_path = info['path']
                    # HWP: Don't pass width/height - let HWPDocument handle
                    # sizing itself; this avoids parameter-compatibility issues.
                    hwp.add_image(image_path)
                    print(f"[DEBUG] Added image to HWP: {image_path}")
                except Exception as e:
                    # Best effort: one bad image must not abort the document
                    print(f"Error adding image to HWP: {e}")

            self._render_body(hwp, elements, {'h2': 2, 'h3': 3}, add_image)

            # Save HWP
            hwp.save(output_path)

        finally:
            # Clean up temporary image files
            self._cleanup_temp_images(images)

        return output_path


# Command-line entry point for manual testing
if __name__ == '__main__':
    import argparse

    # Positional input file and target format, plus an optional output path
    cli = argparse.ArgumentParser(description='Convert HTML ESG report to document format')
    cli.add_argument('input_file', help='Input HTML file path')
    cli.add_argument('output_format', choices=['pdf', 'docx', 'hwp'], help='Output format')
    cli.add_argument('-o', '--output', help='Output file path (optional)')
    options = cli.parse_args()

    # Load the whole report as UTF-8 text
    source_html = Path(options.input_file).read_text(encoding='utf-8')

    # Run the conversion and report where the document was written
    result_path = ESGHTMLConverter().convert(source_html, options.output_format, options.output)

    print(f"✓ Converted successfully: {result_path}")
