"""Import papers from Movebank Data Repository API and literature page."""
import json
import re
import subprocess
import time

from app import create_app
from app.extensions import db
from app.models.paper import Paper


def fetch_json_curl(url):
    """Fetch a URL with curl and parse the response body as JSON.

    Args:
        url: URL to request; sent with an ``Accept: application/json`` header.

    Returns:
        The parsed JSON payload (dict/list).

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero (DNS/connect
            failure) — clearer than letting json.loads choke on empty output.
        subprocess.TimeoutExpired: if the request exceeds 60 seconds.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    result = subprocess.run(
        ['curl', '-s', '-H', 'Accept: application/json', '-H', 'User-Agent: Mozilla/5.0', url],
        capture_output=True, text=True, timeout=60, check=True
    )
    return json.loads(result.stdout)


def fetch_html_curl(url):
    """Fetch a URL with curl (following redirects) and return the raw body.

    Args:
        url: URL to request; ``-L`` follows redirects.

    Returns:
        str: response body (typically HTML).

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero, instead of
            silently returning an empty string as before.
        subprocess.TimeoutExpired: if the request exceeds 120 seconds.
    """
    result = subprocess.run(
        ['curl', '-s', '-L', '-H', 'User-Agent: Mozilla/5.0', url],
        capture_output=True, text=True, timeout=120, check=True
    )
    return result.stdout


def flatten_metadata(metadata):
    """Flatten DSpace 7+ nested metadata into {key: [values]}.

    Each metadata key maps either to a list of ``{'value': ...}`` dicts or
    to a single such dict; anything else is silently dropped. Non-dict
    input yields an empty mapping.
    """
    if not isinstance(metadata, dict):
        return {}
    flattened = {}
    for field, raw in metadata.items():
        if isinstance(raw, list):
            # Keep only dict entries; a missing 'value' becomes ''.
            flattened[field] = [
                entry.get('value', '') for entry in raw if isinstance(entry, dict)
            ]
        elif isinstance(raw, dict) and 'value' in raw:
            flattened[field] = [raw['value']]
    return flattened


def fv(flat, key):
    """Return the first value for *key* in flattened metadata, or None."""
    for value in flat.get(key, []):
        return value
    return None


def clean_doi(doi_str):
    """Normalize a DOI string to its bare ``10.xxxx/...`` form.

    Strips surrounding whitespace, removes any https/http doi.org or
    dx.doi.org resolver prefix, removes a leading ``doi:`` label
    (case-insensitively — previously `` DOI:...`` inputs kept the label
    because the check ran before stripping and was case-sensitive), and
    drops a trailing period left over from citation text.

    Args:
        doi_str: raw DOI string, possibly None or empty.

    Returns:
        Normalized DOI string, or None for falsy input.
    """
    if not doi_str:
        return None
    doi_str = doi_str.strip()
    for prefix in ('https://doi.org/', 'http://doi.org/',
                   'https://dx.doi.org/', 'http://dx.doi.org/'):
        doi_str = doi_str.replace(prefix, '')
    if doi_str.lower().startswith('doi:'):
        doi_str = doi_str[4:]
    return doi_str.strip().rstrip('.')


def import_data_repository():
    """Import papers from the Movebank Data Repository DSpace API.

    Pages through the DSpace 7 ``/discover/search/objects`` endpoint,
    flattens each item's metadata, and upserts one Paper row per item,
    keyed on the publication DOI (preferred) or dataset DOI. Commits once
    per page; a fetch error stops paging but keeps already-committed work.

    Returns:
        int: number of items processed (counts both inserts and updates,
        so it can exceed the number of new rows).
    """
    base_url = 'https://datarepository.movebank.org/server/api/discover/search/objects'
    # UUID used to scope the search — presumably the repository's main
    # collection/community; TODO confirm against the DSpace instance.
    scope = 'e1ae0743-5f80-4700-b21f-0535a10daee9'
    page_size = 100
    total_imported = 0
    page = 0

    while True:
        url = f'{base_url}?query=*&dsoType=ITEM&scope={scope}&size={page_size}&page={page}'
        print(f'Fetching page {page}...')
        try:
            data = fetch_json_curl(url)
        except Exception as e:
            # Network or JSON failure: abort pagination without raising.
            print(f'  Error fetching page {page}: {e}')
            break

        # DSpace 7 nests hits under _embedded.searchResult._embedded.objects.
        embedded = data.get('_embedded', {})
        search_result = embedded.get('searchResult', {})
        sr_embedded = search_result.get('_embedded', {})
        objects = sr_embedded.get('objects', [])

        if not objects:
            print(f'  No more results on page {page}.')
            break

        print(f'  Got {len(objects)} items')

        for obj in objects:
            try:
                # Each search hit wraps the real item in 'indexableObject';
                # fall back to the hit itself if that key is absent.
                item = obj.get('_embedded', {}).get('indexableObject', obj)
                flat = flatten_metadata(item.get('metadata', {}))

                dataset_doi = clean_doi(fv(flat, 'dc.identifier.doi'))
                title = fv(flat, 'dc.title') or item.get('name', '')
                authors_list = flat.get('dc.contributor.author', [])
                authors = '; '.join(authors_list) if authors_list else None

                # Publication DOI: prefer the explicit "is referenced by"
                # field, else the first DataCite related identifier that
                # looks like a doi.org link.
                pub_doi = clean_doi(fv(flat, 'dc.relation.isreferencedby'))
                if not pub_doi:
                    related = flat.get('datacite.RelatedIdentifier', [])
                    for r in related:
                        if r and 'doi.org' in r:
                            pub_doi = clean_doi(r)
                            break

                journal = fv(flat, 'mdr.journal.title')
                citation = fv(flat, 'dc.identifier.citation')
                bibtex = fv(flat, 'mdr.citation.BibTex')
                abstract = fv(flat, 'dc.description.abstract')
                study_id = fv(flat, 'mdr.study.id')

                # Year: first 4-digit run in the issued (or submitted) date.
                date_str = fv(flat, 'dc.date.issued') or fv(flat, 'dc.date.submitted')
                year = None
                if date_str:
                    m = re.search(r'(\d{4})', date_str)
                    if m:
                        year = int(m.group(1))

                # Multi-valued fields are stored as '; '-joined strings.
                species = '; '.join(flat.get('dwc.ScientificName', [])) or None
                sensors = '; '.join(flat.get('sensor.name', [])) or None
                keywords = '; '.join(flat.get('dc.subject', [])) or None

                animal_count = fv(flat, 'mdr.animal.count')
                location_count = fv(flat, 'mdr.location.count')

                # Need at least one identifier or a title to be worth storing.
                unique_doi = pub_doi or dataset_doi
                if not unique_doi and not title:
                    continue

                # Upsert: match on publication DOI first, then dataset DOI.
                existing = None
                if unique_doi:
                    existing = Paper.query.filter_by(doi=unique_doi).first()
                if not existing and dataset_doi:
                    existing = Paper.query.filter_by(dataset_doi=dataset_doi).first()

                p = existing or Paper()
                if not existing:
                    db.session.add(p)

                # Overwrite all fields on every run (repository is the
                # source of truth for these records).
                p.doi = unique_doi
                p.title = title
                p.authors = authors
                p.journal = journal
                p.year = year
                p.abstract = abstract
                p.citation_text = citation
                p.bibtex = bibtex
                p.movebank_study_id = study_id
                p.dataset_doi = dataset_doi
                p.species = species
                p.sensor_types = sensors
                p.keywords = keywords
                # Counts come through as strings; ignore unparseable values.
                if animal_count:
                    try:
                        p.animal_count = int(animal_count)
                    except (ValueError, TypeError):
                        pass
                if location_count:
                    try:
                        p.location_count = int(location_count)
                    except (ValueError, TypeError):
                        pass
                p.source = 'datarepository'
                p.source_url = item.get('_links', {}).get('self', {}).get('href', '')

                total_imported += 1

            except Exception as e:
                # Best-effort per item: log and move on to the next one.
                print(f'  Error processing item: {e}')
                continue

        # One transaction per page keeps partial progress on later failures.
        db.session.commit()
        print(f'  Committed. Total so far: {total_imported}')

        # Stop when the API says this was the last page.
        page_info = search_result.get('page', {})
        total_pages = page_info.get('totalPages', 0)
        if page + 1 >= total_pages:
            break
        page += 1
        time.sleep(1)  # be polite to the server between pages

    print(f'Data Repository import complete: {total_imported} papers.')
    return total_imported


def import_literature_page():
    """Import papers from the Movebank literature HTML page.

    Strips the page's HTML down to text lines, heuristically reassembles
    citation strings from consecutive lines, then creates a Paper row for
    each citation not already in the database — deduplicated by DOI when
    one can be extracted, otherwise by a title-prefix lookup.

    Returns:
        int: number of new Paper rows added (0 if the fetch fails).
    """
    url = 'https://www.movebank.org/cms/movebank-content/literature'
    print('Fetching literature page...')

    try:
        html = fetch_html_curl(url)
    except Exception as e:
        print(f'Error fetching literature page: {e}')
        return 0

    print(f'  Got {len(html)} bytes')

    # Strip all HTML tags to get plain text
    text = re.sub(r'<[^>]+>', '\n', html)
    # Normalize whitespace within lines but preserve line breaks
    lines = text.split('\n')
    lines = [re.sub(r'\s+', ' ', l).strip() for l in lines]
    lines = [l for l in lines if l]

    # Also extract DOIs from the original HTML (they're in href attributes),
    # keyed by the first 50 chars of the link text so they can be matched
    # back to a reassembled citation later.
    doi_links = {}  # map citation text fragment -> DOI
    for m in re.finditer(r'href="[^"]*doi\.org/(10\.\d{4,}/[^"]+)"[^>]*>([^<]*)', html):
        doi = m.group(1).rstrip('.')
        context = m.group(2).strip()
        if context:
            doi_links[context[:50]] = doi

    # Rebuild citations by joining lines that form a single reference.
    # Heuristic: a line starting with a capitalized word begins a new
    # citation once the accumulated text already contains a year.
    doi_pattern = re.compile(r'(?:https?://)?(?:dx\.)?doi\.org/(10\.\d{4,}/\S+)', re.IGNORECASE)
    citations = []
    current = ''

    for line in lines:
        starts_new = bool(re.match(r'^[A-Z][a-zà-ü]+[ ,]', line))
        has_year = bool(re.search(r'(?:19|20)\d{2}', current))

        if starts_new and has_year and len(current) > 50:
            citations.append(current)
            current = line
        elif line:
            current = (current + ' ' + line).strip() if current else line

    if current and len(current) > 50:
        citations.append(current)

    print(f'  Parsed {len(citations)} candidate citations')

    total_imported = 0
    skipped_dup = 0

    for cit in citations:
        # Must contain a plausible publication year (with optional 'a'/'b'
        # suffix, as in "2019a").
        year_match = re.search(r'[\(,. ]((?:19|20)\d{2})[a-z]?[\),.;\s]', cit)
        if not year_match:
            continue

        year = int(year_match.group(1))
        if year < 1990 or year > 2030:
            continue

        # Extract a DOI directly from the citation text...
        doi_match = doi_pattern.search(cit)
        doi = None
        if doi_match:
            doi = clean_doi(doi_match.group(1))

        # ...or via an href-extracted DOI whose link text appears in it.
        if not doi:
            for fragment, d in doi_links.items():
                if fragment in cit:
                    doi = clean_doi(d)
                    break

        # Skip if already in DB
        if doi:
            existing = Paper.query.filter_by(doi=doi).first()
            if existing:
                skipped_dup += 1
                continue

        # Parse authors and title from common citation layouts.
        # Pattern: "Authors (Year) Title. Journal..."
        authors = None
        title = None

        # Try: Authors (YYYY) Title.
        m = re.match(r'^(.+?)\s*\((?:19|20)\d{2}[a-z]?\)\s*(.+?)\.', cit)
        if m:
            authors = m.group(1).strip().rstrip(',').rstrip('.')
            title = m.group(2).strip()

        if not title:
            # Try: Authors. YYYY. Title.
            m = re.match(r'^(.+?)\.\s*(?:19|20)\d{2}[a-z]?\.\s*(.+?)\.', cit)
            if m:
                authors = m.group(1).strip()
                title = m.group(2).strip()

        if not title:
            # Try: Authors, YYYY. Title.
            m = re.match(r'^(.+?),\s*(?:19|20)\d{2}[a-z]?\.\s*(.+?)\.', cit)
            if m:
                authors = m.group(1).strip()
                title = m.group(2).strip()

        if not title:
            title = cit[:300]

        # Skip very short or non-citation text
        if len(cit) < 40:
            continue

        # Dedup by title prefix when there is no DOI. The query is
        # parameterized by SQLAlchemy, so quotes must NOT be doubled (the
        # old "'" -> "''" replacement corrupted the pattern and broke dedup
        # for titles with apostrophes); what does need escaping are the
        # LIKE wildcards % and _ (and the escape char itself).
        if not doi and title and len(title) > 30:
            title_prefix = (title[:60]
                            .replace('\\', '\\\\')
                            .replace('%', '\\%')
                            .replace('_', '\\_'))
            existing = Paper.query.filter(
                Paper.title.ilike(f'{title_prefix}%', escape='\\')
            ).first()
            if existing:
                skipped_dup += 1
                continue

        p = Paper(
            doi=doi,
            title=title,
            authors=authors,
            year=year,
            citation_text=cit[:2000],  # cap runaway joined lines
            source='literature',
            source_url=url,
        )
        db.session.add(p)
        total_imported += 1

        # Commit in batches of 100 to keep transactions small.
        if total_imported % 100 == 0:
            db.session.commit()
            print(f'  ... {total_imported} imported so far')

    db.session.commit()
    print(f'Literature page import complete: {total_imported} new, {skipped_dup} skipped as duplicates.')
    return total_imported


if __name__ == '__main__':
    app = create_app()
    with app.app_context():
        # Wipe previously imported papers so reruns start from scratch.
        prior_count = Paper.query.count()
        if prior_count:
            print(f'Clearing {prior_count} existing papers...')
            Paper.query.delete()
            db.session.commit()

        print('=== Importing from Movebank Data Repository ===')
        repo_imported = import_data_repository()

        print()
        print('=== Importing from Movebank Literature Page ===')
        literature_imported = import_literature_page()

        final_count = Paper.query.count()
        print(f'\n=== Done. Total papers in database: {final_count} ===')
