import json
import re
from datetime import datetime, timezone
from html import unescape
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse


# Input contract: a single required string parameter, ``post_url``
# (read by build_url below).
PARAMETER_DEFS = [{"key": "post_url", "required": True, "type": "string"}]

# Column descriptions for the extracted rows. The keys listed here are the
# same five keys that the row builders in this module put into each row dict.
OUTPUT_SCHEMA = [
    {
        "key": "comment_id",
        "type": "string",
        "description": "Unique identifier of the extracted comment entry.",
    },
    {
        "key": "post_url",
        "type": "string",
        "description": "Direct URL to the extracted social post.",
    },
    {
        "key": "author_username",
        "type": "string",
        "description": "Username of the content creator or posting account.",
    },
    {
        "key": "comment_text",
        "type": "string",
        "description": "Text body of the extracted user comment.",
    },
    {
        "key": "published_at",
        "type": "datetime",
        "description": "Publication timestamp shown for the content.",
    },
]

# Short task description. NOTE(review): not referenced in the visible part of
# this file; presumably consumed by the hosting framework.
PROMPT = "Extract Instagram comments from public post URLs."
# Hard upper bound on pages fetched per run (see _resolve_page_limit).
MAX_PAGES = 20
# Identifier returned as "schema_id" in the result of execute().
SCHEMA_ID = "instagram-comments"


# Lower-case substrings that disqualify an individual field value
# (checked by _has_placeholder_marker).
_FIELD_PLACEHOLDER_MARKERS = (
    "new tab",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
    "please wait",
    "restricted profile",
)

# Substrings that, on a short page, make _looks_like_placeholder_page treat
# the page as a placeholder. Contains every hard marker plus every soft
# marker except "new tab".
_PAGE_BLOCKED_MARKERS = (
    "challenge required",
    "login required",
    "checkpoint",
    "page not found",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
)
# Markers examined first by _detect_markers.
_HARD_BLOCKED_MARKERS = (
    "challenge required",
    "login required",
    "checkpoint",
    "page not found",
)
# Markers examined second by _detect_markers; "new tab" gets a stricter rule
# there (placeholder-looking page AND no content).
_SOFT_BLOCKED_MARKERS = (
    "new tab",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
)

# Accepted username shape: 1-30 ASCII letters, digits, dots or underscores.
_USERNAME_RE = re.compile(r"^[a-zA-Z0-9._]{1,30}$")
# One <script ...>...</script> element; named groups expose attributes and body.
_SCRIPT_TAG_RE = re.compile(r"<script(?P<attrs>[^>]*)>(?P<body>.*?)</script>", re.IGNORECASE | re.DOTALL)
# JSON.parse("...") calls; group 1 is the still-escaped string literal content.
_JSON_PARSE_RE = re.compile(r"JSON\.parse\(\"((?:\\.|[^\"\\])+)\"\)")
# Any single HTML tag.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
# Whole <style> elements including their content.
_STYLE_BLOCK_RE = re.compile(r"<style[^>]*>.*?</style>", re.IGNORECASE | re.DOTALL)
# HTML comments, possibly spanning lines.
_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
# Path of a post, reel or tv URL: /p/<code>/, /reel/<code>/ or /tv/<code>/.
_POST_PATH_RE = re.compile(r"^/(?:p|reel|tv)/[A-Za-z0-9_-]+/?$")


def build_url(parameters=None, start_url=None):
    """Return the normalized URL to start from, or "" when none is usable.

    A non-blank string ``start_url`` takes precedence; otherwise the
    ``post_url`` entry of a ``parameters`` dict is used.
    """
    candidates = [start_url]
    if isinstance(parameters, dict):
        candidates.append(parameters.get("post_url"))

    for candidate in candidates:
        if isinstance(candidate, str) and candidate.strip():
            return _normalize_post_url(candidate)
    return ""


def execute(
    client,
    parameters=None,
    max_pages=None,
    follow_url_filter=None,
    start_url=None,
    on_page=None,
    should_cancel=None,
):
    """Download a post page (and any pagination pages) and extract comment rows.

    Args:
        client: Object exposing ``download_html(url=..., options=...)``; the
            response is read through its ``html``, ``title`` and
            ``screenshot_url`` attributes.
        parameters: Optional dict; ``post_url`` is the input URL and
            ``__debug_capture_screenshots`` toggles screenshot capture.
        max_pages: Optional page cap, clamped by ``_resolve_page_limit``.
        follow_url_filter: Optional callable, substring or object with a
            ``search`` method used to veto pagination targets.
        start_url: Optional URL that overrides ``parameters["post_url"]``.
        on_page: Optional callback invoked once per processed page with a dict
            holding ``page_url``, ``page_index``, ``page_rows`` and, in debug
            mode, ``page_screenshot_url``.
        should_cancel: Optional callable; a truthy result stops the traversal
            before the next page is fetched.

    Returns:
        Dict with ``rows``, ``schema_id``, ``pages_attempted``,
        ``pages_succeeded`` and ``errors`` (a list of diagnostic strings).
        Failures are reported through ``errors`` rather than raised.
    """
    rows = []
    errors = []
    pages_attempted = 0
    pages_succeeded = 0

    initial_url = build_url(parameters=parameters, start_url=start_url)
    if not initial_url:
        # Nothing to fetch: report the missing input and return an empty result.
        errors.append("input_diagnostic: missing required parameter post_url.")
        return {
            "rows": rows,
            "schema_id": SCHEMA_ID,
            "pages_attempted": pages_attempted,
            "pages_succeeded": pages_succeeded,
            "errors": errors,
        }

    page_limit = _resolve_page_limit(max_pages)
    debug_capture = _strict_bool_flag(
        parameters.get("__debug_capture_screenshots") if isinstance(parameters, dict) else None
    )
    source_post_url = initial_url

    seen_page_urls = set()
    next_page_url = initial_url
    # Remembers why pagination stopped so a summary diagnostic can be emitted
    # after the loop.
    last_pagination_state = {
        "marker_detected": False,
        "stop_reason": "",
    }

    while next_page_url and pages_attempted < page_limit:
        if callable(should_cancel):
            try:
                if should_cancel():
                    errors.append(
                        "runtime_diagnostic: execution cancelled before processing next page."
                    )
                    break
            except Exception as exc:
                # A failing cancel callback is recorded but does not stop the run.
                errors.append(f"runtime_diagnostic: should_cancel callback failed: {exc}")

        current_url = next_page_url
        # Cleared here; only set again below when a valid next page is found.
        next_page_url = ""

        if current_url in seen_page_urls:
            errors.append(
                f"page_diagnostic: pagination loop detected at {current_url}; traversal stopped."
            )
            break
        seen_page_urls.add(current_url)

        pages_attempted += 1

        options = {
            "wait_until": "networkidle",
            "wait_ms": 1200,
            "timeout_ms": 45000,
            "screenshot": debug_capture,
        }

        try:
            response = client.download_html(url=current_url, options=options)
        except Exception as exc:
            errors.append(f"page_diagnostic: download_html failed for {current_url}: {exc}")
            # next_page_url is already "", so this ends the loop on the next check.
            continue

        html = response.html if isinstance(response.html, str) else ""
        title = response.title if isinstance(response.title, str) else ""
        screenshot_url = ""
        if debug_capture and isinstance(response.screenshot_url, str):
            screenshot_url = response.screenshot_url

        # Counted as succeeded as soon as the download call returned.
        pages_succeeded += 1

        # Prefer the canonical / og:url advertised by the page, if it is an
        # Instagram URL; otherwise keep the previous value.
        source_post_url = _resolve_post_url_from_html(
            html=html,
            fallback_post_url=source_post_url,
        )

        page_rows, parse_meta = _extract_rows_from_page(
            html=html,
            page_url=current_url,
            output_post_url=source_post_url,
            pagination_base_url=source_post_url,
        )
        marker_hits = _detect_markers(html=html, title=title, has_rows=bool(page_rows))
        if marker_hits:
            errors.append(
                f"page_diagnostic: blocked or placeholder markers detected on {current_url}: "
                + ", ".join(marker_hits)
            )

        # Snapshot of this page's rows for the on_page callback.
        page_new_rows = list(page_rows)
        rows.extend(page_rows)

        next_candidate = parse_meta.get("next_page_url", "")
        marker_detected = bool(parse_meta.get("pagination_marker_detected"))

        if next_candidate:
            next_candidate = _coerce_next_page_url(
                next_candidate=next_candidate,
                source_post_url=source_post_url,
            )
            next_candidate = _normalize_follow_url(current_url, next_candidate)
            valid_target, invalid_reason = _is_valid_pagination_target(
                current_url=current_url,
                next_url=next_candidate,
                source_post_url=source_post_url,
            )
            # Each rejection branch records a stop reason and leaves
            # next_page_url empty, which terminates the traversal.
            if not valid_target:
                marker_detected = True
                last_pagination_state["marker_detected"] = True
                last_pagination_state["stop_reason"] = invalid_reason
                errors.append(
                    f"page_diagnostic: rejected pagination target outside post context on {current_url}: "
                    f"{next_candidate} ({invalid_reason})"
                )
            elif not _pagination_progresses(current_url, next_candidate):
                marker_detected = True
                last_pagination_state["marker_detected"] = True
                last_pagination_state["stop_reason"] = "next_page_url did not advance pagination state"
                errors.append(
                    f"page_diagnostic: pagination marker on {current_url} yielded a non-progressing next_page_url: {next_candidate}"
                )
            elif _is_follow_blocked(next_candidate, follow_url_filter):
                marker_detected = True
                last_pagination_state["marker_detected"] = True
                last_pagination_state["stop_reason"] = "follow_url_filter blocked next page"
                errors.append(
                    f"page_diagnostic: pagination marker on {current_url} blocked by follow_url_filter: {next_candidate}"
                )
            elif next_candidate in seen_page_urls:
                marker_detected = True
                last_pagination_state["marker_detected"] = True
                last_pagination_state["stop_reason"] = "next page already visited"
                errors.append(
                    f"page_diagnostic: pagination marker on {current_url} points to an already visited page: {next_candidate}"
                )
            else:
                # Accepted: schedule the next page and clear any stop reason.
                next_page_url = next_candidate
                last_pagination_state["marker_detected"] = True
                last_pagination_state["stop_reason"] = ""
        elif marker_detected and pages_attempted < page_limit:
            # The page signals more data but no followable URL could be built.
            last_pagination_state["marker_detected"] = True
            last_pagination_state["stop_reason"] = "pagination marker found but next_page_url missing"
            errors.append(
                f"page_diagnostic: pagination marker detected on {current_url} but next_page_url was not extractable."
            )

        if callable(on_page):
            event = {
                "page_url": current_url,
                "page_index": pages_attempted,
                "page_rows": page_new_rows,
            }
            if debug_capture:
                event["page_screenshot_url"] = screenshot_url
            try:
                on_page(event)
            except Exception as exc:
                # Callback errors are reported but never abort the traversal.
                errors.append(f"runtime_diagnostic: on_page callback failed on {current_url}: {exc}")

    if pages_succeeded > 0 and not rows:
        errors.append(
            "parse_diagnostic: page downloads succeeded but no comment rows were extracted."
        )

    errors.extend(_validate_rows(rows))

    # Summarise an early stop: a marker was seen, there is no next page, the
    # page budget was not exhausted and a stop reason was recorded.
    if (
        last_pagination_state.get("marker_detected")
        and not next_page_url
        and pages_attempted < page_limit
        and last_pagination_state.get("stop_reason")
    ):
        errors.append(
            "page_diagnostic: pagination traversal stopped early while marker was present "
            f"({last_pagination_state['stop_reason']})."
        )

    return {
        "rows": rows,
        "schema_id": SCHEMA_ID,
        "pages_attempted": pages_attempted,
        "pages_succeeded": pages_succeeded,
        "errors": errors,
    }


def _resolve_page_limit(max_pages):
    """Return the effective page cap: a positive ``max_pages`` clamped to MAX_PAGES.

    ``None``, values that cannot be converted with ``int()`` and non-positive
    values all fall back to ``MAX_PAGES``.
    """
    if max_pages is not None:
        try:
            requested = int(max_pages)
        except Exception:
            requested = 0
        if requested > 0:
            return min(requested, MAX_PAGES)
    return MAX_PAGES


def _strict_bool_flag(value):
    if value is True:
        return True
    if isinstance(value, str):
        return value.strip().lower() in {"1", "true", "yes", "on"}
    return False


def _normalize_post_url(url):
    url = (url or "").strip()
    if not url:
        return ""

    parsed = urlparse(url)
    if not parsed.scheme:
        parsed = urlparse("https://" + url)

    scheme = parsed.scheme or "https"
    netloc = parsed.netloc.lower()
    path = parsed.path or "/"
    if not path.endswith("/"):
        path = path + "/"

    query_pairs = []
    for key, value in parse_qsl(parsed.query, keep_blank_values=False):
        if key in {"hl", "igshid", "igsh", "utm_source", "utm_medium"}:
            continue
        query_pairs.append((key, value))

    normalized = urlunparse((scheme, netloc, path, "", urlencode(query_pairs), ""))
    return normalized


def _normalize_follow_url(current_url, candidate_url):
    if not candidate_url:
        return ""
    resolved = urljoin(current_url, candidate_url)
    parsed = urlparse(resolved)
    query_pairs = [(k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=False)]
    return urlunparse(
        (
            parsed.scheme or "https",
            parsed.netloc,
            parsed.path,
            "",
            urlencode(query_pairs),
            "",
        )
    )


def _is_follow_blocked(url, follow_url_filter):
    if follow_url_filter is None:
        return False

    try:
        if callable(follow_url_filter):
            return not bool(follow_url_filter(url))
    except Exception:
        return True

    if isinstance(follow_url_filter, str):
        return follow_url_filter not in url

    pattern = getattr(follow_url_filter, "search", None)
    if callable(pattern):
        try:
            return pattern(url) is None
        except Exception:
            return True

    return False


def _detect_markers(html, title, has_rows=False):
    """Return the block/placeholder markers that apply to a downloaded page.

    A marker found in the title or visible text is reported only when the page
    shows no sign of real content or looks like a placeholder; the "new tab"
    marker needs both conditions at once.
    """
    visible_text = _extract_visible_text(html)
    haystack = _clean_whitespace(f"{title} {visible_text}").lower()
    has_content = has_rows or _has_content_markers(html=html, visible_text=visible_text)
    looks_placeholder = _looks_like_placeholder_page(title=title, visible_text=visible_text)

    suspicious = looks_placeholder or not has_content
    strictly_suspicious = looks_placeholder and not has_content

    flagged = []

    for marker in _HARD_BLOCKED_MARKERS:
        if marker in haystack and marker not in flagged and suspicious:
            flagged.append(marker)

    for marker in _SOFT_BLOCKED_MARKERS:
        if marker in flagged or marker not in haystack:
            continue
        required = strictly_suspicious if marker == "new tab" else suspicious
        if required:
            flagged.append(marker)

    return flagged


def _has_content_markers(html, visible_text):
    html_lc = (html or "").lower()
    text_lc = (visible_text or "").lower()

    html_markers = (
        "edge_media_to_parent_comment",
        "edge_media_to_comment",
        "xdt_shortcode_media",
        '"comment_count"',
        "/c/",
        "listing-card",
        "data-listing-id",
        "search-results",
        "results-list",
    )
    if any(marker in html_lc for marker in html_markers):
        return True

    if re.search(r"\b\d+\s+comments?\b", text_lc):
        return True

    if re.search(r"@[a-z0-9._]{1,30}", text_lc) and re.search(
        r"\b(comment|reply|replies)\b", text_lc
    ):
        return True

    return False


def _looks_like_placeholder_page(title, visible_text):
    """Return True for short pages that look like a block or placeholder screen.

    Either the title is exactly one of "new tab", "access denied" or
    "robot check" with fewer than 220 characters of text, or the text is
    shorter than 160 characters and contains a page-blocked marker.
    """
    normalized_title = _clean_whitespace(title).lower()
    body = _clean_whitespace(visible_text).lower()

    known_titles = ("new tab", "access denied", "robot check")
    if len(body) < 220 and normalized_title in known_titles:
        return True

    return len(body) < 160 and any(marker in body for marker in _PAGE_BLOCKED_MARKERS)


def _extract_rows_from_page(html, page_url, output_post_url, pagination_base_url):
    """Extract comment rows and pagination metadata from one page of HTML.

    Rows come from embedded JSON first; the DOM heuristic is used only when
    the JSON yields nothing. Returns ``(rows, meta)`` where ``meta`` has the
    keys ``next_page_url`` and ``pagination_marker_detected``.
    """
    row_post_url = _normalize_post_url(output_post_url or page_url)
    cursor_base_url = _normalize_post_url(pagination_base_url or page_url)

    roots = _collect_json_roots(html)

    collected = [
        row
        for root in roots
        for row in _extract_rows_from_json_root(root, post_url=row_post_url)
    ]
    if not collected:
        collected.extend(_extract_rows_from_dom(html=html, post_url=row_post_url))

    next_url, saw_marker = _extract_next_page_url(
        html=html,
        current_url=cursor_base_url,
        json_roots=roots,
    )

    meta = {
        "next_page_url": next_url,
        "pagination_marker_detected": saw_marker,
    }
    return collected, meta


def _collect_json_roots(html):
    """Parse every JSON payload found in ``<script>`` tags, without duplicates.

    Payloads are de-duplicated by their canonical serialization and returned
    in first-seen order.
    """
    unique_roots = {}

    for script in _SCRIPT_TAG_RE.finditer(html):
        script_body = (script.group("body") or "").strip()
        if not script_body:
            continue
        script_attrs = script.group("attrs") or ""

        for payload in _json_candidates_from_script(script_attrs, script_body):
            # setdefault keeps the first payload seen for each fingerprint.
            unique_roots.setdefault(_json_fingerprint(payload), payload)

    return list(unique_roots.values())


def _json_candidates_from_script(attrs, body):
    candidates = []

    lower_attrs = attrs.lower()
    lower_body = body.lower()

    if "application/json" in lower_attrs or body[:1] in "[{":
        parsed = _safe_json_loads(body)
        if parsed is not None:
            candidates.append(parsed)

    for marker in (
        "window._sharedData",
        "window.__additionalDataLoaded",
        "window.__initialDataLoaded",
        "window.__initialData",
    ):
        extracted = _extract_json_after_marker(body, marker)
        if extracted is not None:
            parsed = _safe_json_loads(extracted)
            if parsed is not None:
                candidates.append(parsed)

    for m in _JSON_PARSE_RE.finditer(body):
        try:
            decoded = bytes(m.group(1), "utf-8").decode("unicode_escape")
        except Exception:
            continue
        parsed = _safe_json_loads(decoded)
        if parsed is not None:
            candidates.append(parsed)

    if "has_next_page" in lower_body and "end_cursor" in lower_body:
        snippet = _extract_balanced_from_token(body, "{", "}")
        parsed = _safe_json_loads(snippet) if snippet else None
        if parsed is not None:
            candidates.append(parsed)

    return candidates


def _extract_json_after_marker(text, marker):
    idx = text.find(marker)
    if idx == -1:
        return None

    brace_idx = text.find("{", idx)
    bracket_idx = text.find("[", idx)

    start_idx = -1
    open_ch = ""
    close_ch = ""

    if brace_idx != -1 and (bracket_idx == -1 or brace_idx < bracket_idx):
        start_idx = brace_idx
        open_ch = "{"
        close_ch = "}"
    elif bracket_idx != -1:
        start_idx = bracket_idx
        open_ch = "["
        close_ch = "]"

    if start_idx == -1:
        return None

    return _extract_balanced_segment(text, start_idx, open_ch, close_ch)


def _extract_balanced_from_token(text, open_ch, close_ch):
    start_idx = text.find(open_ch)
    if start_idx == -1:
        return None
    return _extract_balanced_segment(text, start_idx, open_ch, close_ch)


def _extract_balanced_segment(text, start_idx, open_ch, close_ch):
    depth = 0
    in_string = False
    escape = False

    for i in range(start_idx, len(text)):
        ch = text[i]

        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
            continue

        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return text[start_idx : i + 1]

    return None


def _safe_json_loads(raw):
    if not isinstance(raw, str):
        return None

    candidate = raw.strip()
    if not candidate:
        return None

    if candidate.endswith(";"):
        candidate = candidate[:-1]

    try:
        return json.loads(candidate)
    except Exception:
        return None


def _json_fingerprint(value):
    try:
        return json.dumps(value, sort_keys=True, separators=(",", ":"))
    except Exception:
        return str(type(value))


def _extract_rows_from_json_root(root, post_url):
    rows = []

    stack = [(root, "")]

    while stack:
        current, path = stack.pop()

        if isinstance(current, dict):
            row = _row_from_comment_node(node=current, path=path, post_url=post_url)
            if row:
                rows.append(row)

            for key, value in current.items():
                child_path = f"{path}.{key}" if path else str(key)
                if isinstance(value, (dict, list)):
                    stack.append((value, child_path))

        elif isinstance(current, list):
            for idx, value in enumerate(current):
                child_path = f"{path}[{idx}]" if path else f"[{idx}]"
                if isinstance(value, (dict, list)):
                    stack.append((value, child_path))

    return rows


def _row_from_comment_node(node, path, post_url):
    """Build an output row from a JSON dict that looks like a comment.

    A node qualifies when its path mentions "comment" or "thread", or when it
    carries one of the comment-specific keys. The row is produced only if an
    id, a text and an author username can all be extracted; otherwise
    ``None`` is returned. ``published_at`` may be empty.
    """
    lowered_path = path.lower()
    identity_keys = (
        "is_ranked_comment",
        "did_report_as_spam",
        "comment_like_count",
        "child_comment_count",
        "parent_comment_id",
    )

    path_hint = "comment" in lowered_path or "thread" in lowered_path
    if not (path_hint or any(key in node for key in identity_keys)):
        return None

    comment_id = _extract_comment_id(node=node, path_lc=lowered_path)
    if not comment_id:
        return None

    text = _extract_comment_text(node)
    if not text:
        return None

    author = _extract_author_username(node)
    if not author:
        return None

    row = {
        "comment_id": comment_id,
        "post_url": post_url,
        "author_username": author,
        "comment_text": text,
        "published_at": _extract_published_at(node),
    }
    return row


def _extract_comment_id(node, path_lc=""):
    """Return the comment id found in ``node``, or "" when there is none.

    Lookup order: explicit id keys (comment_id, comment_pk, pk); a generic
    ``id`` when the path mentions "comment" or "thread"; and finally the
    numeric part of a ``/c/<digits>`` permalink.
    """
    for key in ("comment_id", "comment_pk", "pk"):
        if key not in node:
            continue
        explicit = _clean_whitespace(str(node.get(key, "")))
        if _looks_like_comment_id(explicit):
            return explicit

    generic_id = _clean_whitespace(str(node.get("id", "")))
    in_comment_path = "comment" in path_lc or "thread" in path_lc
    if generic_id and in_comment_path and _looks_like_comment_id(generic_id):
        return generic_id

    link = node.get("permalink") or node.get("url")
    if isinstance(link, str):
        found = re.search(r"/c/(\d+)", link)
        if found:
            return found.group(1)

    return ""


def _looks_like_comment_id(value):
    if re.fullmatch(r"\d{5,}", value):
        return True
    if re.fullmatch(r"[A-Za-z0-9_\-]{12,}", value):
        return True
    return False


def _extract_comment_text(node):
    direct_candidates = []
    for key in ("text", "comment_text", "body"):
        if key in node and isinstance(node.get(key), str):
            direct_candidates.append(node.get(key))

    for value in direct_candidates:
        cleaned = _clean_whitespace(value)
        if cleaned and not _is_noise_text(cleaned) and not _has_placeholder_marker(cleaned):
            return cleaned

    for key in ("content", "comment"):
        val = node.get(key)
        if isinstance(val, dict):
            for child_key in ("text", "body"):
                child_value = val.get(child_key)
                if isinstance(child_value, str):
                    cleaned = _clean_whitespace(child_value)
                    if cleaned and not _is_noise_text(cleaned) and not _has_placeholder_marker(cleaned):
                        return cleaned

    return ""


def _extract_author_username(node):
    for key in ("author", "owner", "user"):
        value = node.get(key)
        if isinstance(value, dict):
            username = value.get("username")
            if isinstance(username, str):
                normalized = _normalize_username(username)
                if normalized:
                    return normalized

    username = node.get("username")
    if isinstance(username, str):
        normalized = _normalize_username(username)
        if normalized:
            return normalized

    return ""


def _extract_published_at(node):
    for key in (
        "created_at",
        "created_time",
        "createdAt",
        "timestamp",
        "taken_at",
        "date",
        "published_at",
    ):
        if key in node:
            normalized = _normalize_datetime(node.get(key))
            if normalized:
                return normalized

    for key in ("time", "meta"):
        value = node.get(key)
        if isinstance(value, dict):
            nested = _extract_published_at(value)
            if nested:
                return nested

    return ""


def _extract_rows_from_dom(html, post_url):
    rows = []

    for match in re.finditer(r"/(?:p|reel|tv)/[A-Za-z0-9_-]+/c/(?P<cid>\d+)", html):
        cid = match.group("cid")
        start = max(0, match.start() - 2500)
        end = min(len(html), match.end() + 1200)
        window = html[start:end]

        username = _extract_username_from_window(window)
        comment_text = _extract_comment_text_from_window(window, username)
        published_at = _extract_datetime_from_window(window)

        if not cid or not username or not comment_text:
            continue

        row = {
            "comment_id": cid,
            "post_url": post_url,
            "author_username": username,
            "comment_text": comment_text,
            "published_at": published_at,
        }
        rows.append(row)

    return rows


def _extract_username_from_window(window):
    username_matches = list(
        re.finditer(r'href="/(?P<username>[A-Za-z0-9._]{1,30})/?(?:[?#][^"]*)?"', window)
    )
    for candidate in reversed(username_matches):
        username = _normalize_username(candidate.group("username"))
        if username and username.lower() not in {"explore", "accounts", "reels", "direct"}:
            return username
    return ""


def _extract_comment_text_from_window(window, username):
    best = ""
    for match in re.finditer(r"<span[^>]*>(.*?)</span>", window, flags=re.IGNORECASE | re.DOTALL):
        text = _clean_html_fragment(match.group(1))
        if not text:
            continue
        if username and text.lower() == username.lower():
            continue
        if _is_noise_text(text):
            continue
        if _has_placeholder_marker(text):
            continue
        best = text
        break

    return best


def _extract_datetime_from_window(window):
    match = re.search(r'<time[^>]*datetime="([^"]+)"', window, flags=re.IGNORECASE)
    if not match:
        return ""
    return _normalize_datetime(match.group(1))


def _clean_html_fragment(fragment):
    if not fragment:
        return ""
    without_tags = _HTML_TAG_RE.sub(" ", fragment)
    decoded = unescape(without_tags)
    return _clean_whitespace(decoded)


def _is_noise_text(text):
    token = text.strip().lower()
    if not token:
        return True

    if len(token) <= 2:
        return True

    if re.fullmatch(r"view all \d+ comments?", token):
        return True
    if re.fullmatch(r"\d+ comments?", token):
        return True
    if re.fullmatch(r"\d+ replies?", token):
        return True

    noisy = {
        "like",
        "likes",
        "reply",
        "replies",
        "see translation",
        "view replies",
        "more",
        "follow",
        "following",
        "verified",
        "share",
        "send",
        "log in",
        "sign up",
    }
    return token in noisy


def _has_placeholder_marker(value):
    """Return True when ``value`` contains any field placeholder marker.

    The comparison is done on the whitespace-cleaned, lower-cased value; an
    empty value never matches.
    """
    lowered = _clean_whitespace(value).lower()
    if lowered:
        for marker in _FIELD_PLACEHOLDER_MARKERS:
            if marker in lowered:
                return True
    return False


def _normalize_username(value):
    """Return ``value`` as a bare username, or "" when it is not a valid one.

    Whitespace is cleaned, one leading "@" is removed, and the remainder must
    match the username pattern.
    """
    handle = _clean_whitespace(value)
    if not handle:
        return ""

    candidate = handle[1:] if handle.startswith("@") else handle
    if _USERNAME_RE.fullmatch(candidate):
        return candidate
    return ""


def _normalize_datetime(value):
    if value is None:
        return ""

    if isinstance(value, (int, float)):
        ts = float(value)
        if ts > 1_000_000_000_000:
            ts = ts / 1000.0
        if ts <= 0:
            return ""
        try:
            dt = datetime.fromtimestamp(ts, tz=timezone.utc)
        except Exception:
            return ""
        return dt.replace(microsecond=0).isoformat().replace("+00:00", "Z")

    value = _clean_whitespace(str(value))
    if not value:
        return ""

    if re.fullmatch(r"\d{10,13}", value):
        return _normalize_datetime(int(value))

    if value.endswith("Z"):
        return value

    if re.fullmatch(r"\d{4}-\d{2}-\d{2}([ T]\d{2}:\d{2}(:\d{2})?)?", value):
        return value.replace(" ", "T")

    return value


def _extract_next_page_url(html, current_url, json_roots):
    marker_detected = False

    next_link_match = re.search(
        r'<link[^>]+rel=["\']next["\'][^>]+href=["\']([^"\']+)["\']',
        html,
        flags=re.IGNORECASE,
    )
    if next_link_match:
        return next_link_match.group(1), True

    for root in json_roots:
        candidate, marker = _next_from_json_root(root, current_url)
        marker_detected = marker_detected or marker
        if candidate:
            return candidate, True

    text_lc = html.lower()
    if '"has_next_page":true' in text_lc or "has_next_page\\\":true" in text_lc:
        marker_detected = True
        cursor = _extract_cursor_from_html(html)
        if cursor:
            return _with_query(current_url, {"cursor": cursor}), True

    return "", marker_detected


def _next_from_json_root(root, current_url):
    marker_detected = False
    stack = [root]

    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            page_info = current.get("page_info")
            if isinstance(page_info, dict):
                has_next = bool(page_info.get("has_next_page"))
                marker_detected = marker_detected or has_next
                if has_next:
                    cursor = page_info.get("end_cursor") or page_info.get("next_cursor")
                    if isinstance(cursor, str) and cursor:
                        return _with_query(current_url, {"cursor": cursor}), True

            paging = current.get("paging")
            if isinstance(paging, dict):
                next_url = paging.get("next")
                if isinstance(next_url, str) and next_url:
                    return next_url, True

            if isinstance(current.get("next_max_id"), str) and current.get("next_max_id"):
                marker_detected = True
                return _with_query(current_url, {"max_id": current.get("next_max_id")}), True

            if isinstance(current.get("next_page_url"), str) and current.get("next_page_url"):
                return current.get("next_page_url"), True

            for value in current.values():
                if isinstance(value, (dict, list)):
                    stack.append(value)

        elif isinstance(current, list):
            for value in current:
                if isinstance(value, (dict, list)):
                    stack.append(value)

    return "", marker_detected


def _extract_cursor_from_html(html):
    patterns = (
        r'"end_cursor"\s*:\s*"([^"]+)"',
        r"\\\"end_cursor\\\"\s*:\s*\\\"([^\\]+)\\\"",
        r'"next_max_id"\s*:\s*"([^"]+)"',
    )
    for pattern in patterns:
        match = re.search(pattern, html)
        if match:
            return match.group(1)
    return ""


def _with_query(url, params):
    parsed = urlparse(url)
    existing = dict(parse_qsl(parsed.query, keep_blank_values=False))
    existing.update({k: v for k, v in params.items() if v is not None and v != ""})
    return urlunparse(
        (parsed.scheme or "https", parsed.netloc, parsed.path, "", urlencode(existing), "")
    )


def _extract_visible_text(html):
    if not html:
        return ""
    text = _SCRIPT_TAG_RE.sub(" ", html)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    text = _HTML_COMMENT_RE.sub(" ", text)
    text = _HTML_TAG_RE.sub(" ", text)
    return _clean_whitespace(unescape(text))


def _resolve_post_url_from_html(html, fallback_post_url):
    """Return the post URL advertised by the page, else the normalized fallback.

    The ``<link rel="canonical">`` href is preferred over the ``og:url`` meta
    content; a candidate is accepted only when its normalized host is an
    Instagram host.
    """
    if html:
        advertised_patterns = (
            r'<link[^>]+rel=["\']canonical["\'][^>]+href=["\']([^"\']+)["\']',
            r'<meta[^>]+property=["\']og:url["\'][^>]+content=["\']([^"\']+)["\']',
        )
        for pattern in advertised_patterns:
            found = re.search(pattern, html, flags=re.IGNORECASE)
            if not found:
                continue
            advertised = _normalize_post_url(found.group(1))
            if _is_instagram_host(urlparse(advertised).netloc):
                return advertised

    return _normalize_post_url(fallback_post_url)


def _coerce_next_page_url(next_candidate, source_post_url):
    """Re-anchor an off-host next-page URL onto the source post URL.

    Returns ``""`` for an empty candidate. The candidate is returned
    unchanged when the source is not on an Instagram host, when the
    candidate has no host, or when both hosts are identical. Otherwise, if
    the candidate carries a non-empty ``cursor`` (preferred) or ``max_id``
    query value, that value is applied to ``source_post_url``; failing that
    the candidate is returned as is.
    """
    if not next_candidate:
        return ""

    source_host = urlparse(source_post_url).netloc
    candidate_parts = urlparse(next_candidate)
    candidate_host = candidate_parts.netloc

    if not _is_instagram_host(source_host):
        return next_candidate
    if not candidate_host or candidate_host == source_host:
        return next_candidate

    query = dict(parse_qsl(candidate_parts.query, keep_blank_values=False))
    for token_name in ("cursor", "max_id"):
        token = query.get(token_name)
        if token:
            return _with_query(source_post_url, {token_name: token})

    return next_candidate


def _is_instagram_host(netloc):
    """Return True when ``netloc`` names instagram.com or one of its subdomains.

    ``netloc`` is the authority component of a URL, so it may carry
    ``user:password@`` userinfo and a ``:port`` suffix; both are stripped
    before the hostname is compared. Matching is case-insensitive and
    anchored on a label boundary, so look-alikes such as
    ``evilinstagram.com`` or ``instagram.com.evil.com`` are rejected.
    ``None`` and ``""`` return False.
    """
    host = (netloc or "").lower()

    # Everything up to the last "@" is userinfo, never the host. This also
    # means "instagram.com@evil.com" is judged by its real host, evil.com.
    host = host.rpartition("@")[2]

    # Drop an explicit (possibly empty) port.
    name, colon, port = host.rpartition(":")
    if colon and (not port or port.isdigit()):
        host = name

    return host == "instagram.com" or host.endswith(".instagram.com")


def _is_valid_pagination_target(current_url, next_url, source_post_url):
    """Check that ``next_url`` stays on the same Instagram post.

    Returns:
        ``(True, "")`` when the target is acceptable, otherwise
        ``(False, reason)``. The checks run in order: the target host must be
        an Instagram host, must belong to the same host family as the source,
        its slash-normalised path must fully match ``_POST_PATH_RE`` (defined
        elsewhere; per the failure message, post/reel/tv paths), must equal
        the source path, and must equal the current page's path when that
        path is non-empty.
    """
    next_parts = urlparse(next_url)
    current_parts = urlparse(current_url)
    source_parts = urlparse(source_post_url)

    next_host = next_parts.netloc
    if not _is_instagram_host(next_host):
        return False, "next page host is not instagram"
    if not _same_instagram_host_family(next_host, source_parts.netloc):
        return False, "next page host does not match source host family"

    source_path = _normalized_path(source_parts.path)
    next_path = _normalized_path(next_parts.path)

    if _POST_PATH_RE.fullmatch(next_path) is None:
        return False, "next page path is not a post/reel/tv URL"
    if source_path and next_path != source_path:
        return False, "next page path changed outside the source post"

    current_path = current_parts.path
    if current_path and next_path != _normalized_path(current_path):
        return False, "next page path changed from current post path"

    return True, ""


def _same_instagram_host_family(host_a, host_b):
    """Return True when both hosts reduce to the same host family."""
    family_a = _instagram_host_family(host_a)
    family_b = _instagram_host_family(host_b)
    return family_a == family_b


def _instagram_host_family(host):
    """Return ``host`` trimmed, lower-cased and without one leading ``www.``.

    ``None`` is treated as the empty string.
    """
    normalized = (host or "").strip().lower()
    return normalized[4:] if normalized.startswith("www.") else normalized


def _normalized_path(path):
    """Return ``path`` with a guaranteed trailing slash; empty becomes ``/``."""
    if not path:
        return "/"
    return path if path.endswith("/") else path + "/"


def _pagination_progresses(current_url, next_url):
    """Return True when ``next_url`` is a different page than ``current_url``.

    The two URLs count as different when any of these differ, checked in
    order: normalised (scheme, host, path) identity, the pagination token
    values, or the remaining query parameters used for comparison.
    """
    here = urlparse(current_url)
    there = urlparse(next_url)

    return (
        _normalized_page_identity(here) != _normalized_page_identity(there)
        or _pagination_tokens(here) != _pagination_tokens(there)
        or _comparison_query_params(here) != _comparison_query_params(there)
    )


def _normalized_page_identity(parsed):
    """Return a ``(scheme, host, path)`` tuple identifying the page.

    The scheme is lower-cased and defaults to ``https``; the host is
    lower-cased and, for Instagram hosts, reduced to its host family; the
    path defaults to ``/`` and always ends with a slash.

    Args:
        parsed: A parsed URL exposing ``scheme``, ``netloc`` and ``path``.
    """
    scheme = (parsed.scheme or "https").lower()

    host = parsed.netloc.lower()
    if _is_instagram_host(host):
        host = _instagram_host_family(host)

    path = parsed.path if parsed.path else "/"
    if not path.endswith("/"):
        path += "/"

    return (scheme, host, path)


def _pagination_tokens(parsed):
    """Return the ``(cursor, max_id, end_cursor, next_max_id)`` query values.

    Missing or blank parameters are reported as ``""``; when a key is
    repeated in the query, its last value is used.
    """
    token_names = ("cursor", "max_id", "end_cursor", "next_max_id")
    query = dict(parse_qsl(parsed.query, keep_blank_values=False))
    return tuple(query.get(name, "") for name in token_names)


def _comparison_query_params(parsed):
    """Return the query as a sorted tuple of ``(key, value)`` pairs.

    Blank-valued parameters are dropped, and so are the ``x-amz-*`` request
    signing parameters listed below (matched case-insensitively), so that two
    URLs differing only in those are treated as equal by callers.
    """
    signing_keys = frozenset(
        (
            "x-amz-algorithm",
            "x-amz-credential",
            "x-amz-date",
            "x-amz-expires",
            "x-amz-signedheaders",
            "x-amz-signature",
            "x-amz-security-token",
        )
    )
    kept = [
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=False)
        if key.lower() not in signing_keys
    ]
    kept.sort()
    return tuple(kept)


def _clean_whitespace(value):
    """Stringify ``value`` and collapse every whitespace run to one space.

    Non-breaking spaces are treated as ordinary spaces and the result is
    stripped at both ends. ``None`` yields ``""``.
    """
    if value is None:
        return ""
    text = str(value).replace("\u00a0", " ")
    return re.sub(r"\s+", " ", text).strip()


def _validate_rows(rows):
    """Build human-readable quality diagnostics for extracted rows.

    Three checks are made, each contributing at most one message per field:
    required fields that are empty after whitespace cleaning, username or
    comment values containing any ``_FIELD_PLACEHOLDER_MARKERS`` entry
    (case-insensitive substring match), and ``published_at`` being
    non-empty in fewer than 70% of rows.

    Args:
        rows: Sequence of mappings supporting ``.get``.

    Returns:
        A list of diagnostic strings; empty when ``rows`` is empty or no
        check fires.
    """
    if not rows:
        return []

    diagnostics = []

    # Check 1: required fields that are missing or whitespace-only.
    for field in ("comment_id", "post_url", "author_username", "comment_text"):
        empty_count = sum(
            1 for row in rows if not _clean_whitespace(row.get(field, ""))
        )
        if empty_count:
            diagnostics.append(
                f"field_diagnostic: required field '{field}' empty in {empty_count}/{len(rows)} rows."
            )

    # Check 2: values that look like block-page or placeholder text.
    for field in ("author_username", "comment_text"):
        noisy_count = 0
        for row in rows:
            lowered = _clean_whitespace(row.get(field, "")).lower()
            if lowered and any(
                marker in lowered for marker in _FIELD_PLACEHOLDER_MARKERS
            ):
                noisy_count += 1
        if noisy_count:
            diagnostics.append(
                f"field_diagnostic: placeholder/noisy values found in '{field}' for {noisy_count}/{len(rows)} rows."
            )

    # Check 3: timestamp coverage across all rows.
    with_timestamp = sum(
        1 for row in rows if _clean_whitespace(row.get("published_at", ""))
    )
    if with_timestamp / float(len(rows)) < 0.70:
        diagnostics.append(
            "field_diagnostic: published_at non-empty ratio below 0.70; timestamp coverage is low."
        )

    return diagnostics
