"""Extract comments from public Instagram post pages.

The module exposes two entry points:

* ``build_url`` turns the caller's input into a normalized post URL.
* ``execute`` downloads the post page (following pagination markers), pulls
  comment rows out of embedded JSON payloads (falling back to DOM scraping)
  and returns the rows together with human-readable diagnostics.
"""

import json
import re
from datetime import datetime, timezone
from html import unescape
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse

PARAMETER_DEFS = [{"key": "post_url", "required": True, "type": "string"}]

OUTPUT_SCHEMA = [
    {
        "key": "comment_id",
        "type": "string",
        "description": "Unique identifier of the extracted comment entry.",
    },
    {
        "key": "post_url",
        "type": "string",
        "description": "Direct URL to the extracted social post.",
    },
    {
        "key": "author_username",
        "type": "string",
        "description": "Username of the content creator or posting account.",
    },
    {
        "key": "comment_text",
        "type": "string",
        "description": "Text body of the extracted user comment.",
    },
    {
        "key": "published_at",
        "type": "datetime",
        "description": "Publication timestamp shown for the content.",
    },
]

PROMPT = "Extract Instagram comments from public post URLs."
MAX_PAGES = 20
SCHEMA_ID = "instagram-comments"

# Substrings that mark an extracted field value as browser/anti-bot boilerplate.
_FIELD_PLACEHOLDER_MARKERS = (
    "new tab",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
    "please wait",
    "restricted profile",
)
# Substrings that mark a short page as a block/interstitial page.
_PAGE_BLOCKED_MARKERS = (
    "challenge required",
    "login required",
    "checkpoint",
    "page not found",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
)
_HARD_BLOCKED_MARKERS = (
    "challenge required",
    "login required",
    "checkpoint",
    "page not found",
)
_SOFT_BLOCKED_MARKERS = (
    "new tab",
    "access denied",
    "robot check",
    "captcha",
    "verify you are human",
)

_USERNAME_RE = re.compile(r"^[a-zA-Z0-9._]{1,30}$")
# The group names "attrs" and "body" are the ones _collect_json_roots reads.
_SCRIPT_TAG_RE = re.compile(
    r"<script(?P<attrs>[^>]*)>(?P<body>.*?)</script>",
    re.IGNORECASE | re.DOTALL,
)
# Matches JSON.parse("...") where the argument is a double-quoted JS literal.
_JSON_PARSE_RE = re.compile(r"JSON\.parse\(\"((?:\\.|[^\"\\])+)\"\)")
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_STYLE_BLOCK_RE = re.compile(r"<style[^>]*>.*?</style>", re.IGNORECASE | re.DOTALL)
_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
_POST_PATH_RE = re.compile(r"^/(?:p|reel|tv)/[A-Za-z0-9_-]+/?$")

# NOTE(review): the element names in the patterns below (span, time, a/link,
# link, meta) were reconstructed from how each pattern is used -- confirm them
# against the upstream markup expectations.
_DOM_COMMENT_LINK_RE = re.compile(r"/(?:p|reel|tv)/[A-Za-z0-9_-]+/c/(?P<cid>\d+)")
_PROFILE_HREF_RE = re.compile(
    r'href="/(?P<username>[A-Za-z0-9._]{1,30})/?(?:[?#][^"]*)?"'
)
_SPAN_RE = re.compile(r"<span[^>]*>(.*?)</span>", re.IGNORECASE | re.DOTALL)
_TIME_DATETIME_RE = re.compile(r'<time[^>]*datetime="([^"]+)"', re.IGNORECASE)
_NEXT_LINK_RE = re.compile(
    r'<(?:a|link)[^>]+rel=["\']next["\'][^>]+href=["\']([^"\']+)["\']',
    re.IGNORECASE,
)
_CANONICAL_LINK_RE = re.compile(
    r'<link[^>]+rel=["\']canonical["\'][^>]+href=["\']([^"\']+)["\']',
    re.IGNORECASE,
)
_OG_URL_RE = re.compile(
    r'<meta[^>]+property=["\']og:url["\'][^>]+content=["\']([^"\']+)["\']',
    re.IGNORECASE,
)


def build_url(parameters=None, start_url=None):
    """Return the normalized post URL to start from, or "" when none is given.

    ``start_url`` wins over ``parameters["post_url"]``; blank or non-string
    values are ignored.
    """
    if isinstance(start_url, str) and start_url.strip():
        return _normalize_post_url(start_url)
    if isinstance(parameters, dict):
        post_url = parameters.get("post_url")
        if isinstance(post_url, str) and post_url.strip():
            return _normalize_post_url(post_url)
    return ""


def execute(
    client,
    parameters=None,
    max_pages=None,
    follow_url_filter=None,
    start_url=None,
    on_page=None,
    should_cancel=None,
):
    """Download a post page (and its comment pages) and extract comment rows.

    Args:
        client: object exposing ``download_html(url=..., options=...)`` whose
            result carries ``html``, ``title`` and ``screenshot_url``.
        parameters: dict with ``post_url`` and the optional
            ``__debug_capture_screenshots`` flag.
        max_pages: page cap; clamped to ``1..MAX_PAGES``.
        follow_url_filter: callable, substring or object with ``search`` used
            to veto pagination targets.
        start_url: overrides ``parameters["post_url"]`` when non-blank.
        on_page: optional callback invoked once per successfully parsed page.
        should_cancel: optional callback; a truthy result stops traversal.

    Returns:
        dict with ``rows``, ``schema_id``, ``pages_attempted``,
        ``pages_succeeded`` and ``errors`` (diagnostic strings). Rows are
        unique by ``comment_id`` across the whole run.
    """
    rows = []
    errors = []
    pages_attempted = 0
    pages_succeeded = 0

    def _result():
        return {
            "rows": rows,
            "schema_id": SCHEMA_ID,
            "pages_attempted": pages_attempted,
            "pages_succeeded": pages_succeeded,
            "errors": errors,
        }

    initial_url = build_url(parameters=parameters, start_url=start_url)
    if not initial_url:
        errors.append("input_diagnostic: missing required parameter post_url.")
        return _result()

    page_limit = _resolve_page_limit(max_pages)
    debug_capture = _strict_bool_flag(
        parameters.get("__debug_capture_screenshots")
        if isinstance(parameters, dict)
        else None
    )
    source_post_url = initial_url
    seen_page_urls = set()
    seen_comment_ids = set()
    next_page_url = initial_url
    last_pagination_state = {"marker_detected": False, "stop_reason": ""}

    while next_page_url and pages_attempted < page_limit:
        if callable(should_cancel):
            try:
                if should_cancel():
                    errors.append(
                        "runtime_diagnostic: execution cancelled before processing next page."
                    )
                    break
            except Exception as exc:  # a broken callback must not abort the run
                errors.append(f"runtime_diagnostic: should_cancel callback failed: {exc}")

        current_url = next_page_url
        next_page_url = ""
        if current_url in seen_page_urls:
            errors.append(
                f"page_diagnostic: pagination loop detected at {current_url}; traversal stopped."
            )
            break
        seen_page_urls.add(current_url)
        pages_attempted += 1

        options = {
            "wait_until": "networkidle",
            "wait_ms": 1200,
            "timeout_ms": 45000,
            "screenshot": debug_capture,
        }
        try:
            response = client.download_html(url=current_url, options=options)
        except Exception as exc:  # client errors are reported, not raised
            errors.append(f"page_diagnostic: download_html failed for {current_url}: {exc}")
            # next_page_url is already "", so the loop ends here.
            continue

        # getattr keeps a response object lacking an attribute from crashing.
        raw_html = getattr(response, "html", None)
        raw_title = getattr(response, "title", None)
        raw_screenshot = getattr(response, "screenshot_url", None)
        html = raw_html if isinstance(raw_html, str) else ""
        title = raw_title if isinstance(raw_title, str) else ""
        screenshot_url = ""
        if debug_capture and isinstance(raw_screenshot, str):
            screenshot_url = raw_screenshot
        pages_succeeded += 1

        source_post_url = _resolve_post_url_from_html(
            html=html,
            fallback_post_url=source_post_url,
        )
        page_rows, parse_meta = _extract_rows_from_page(
            html=html,
            page_url=current_url,
            output_post_url=source_post_url,
            pagination_base_url=source_post_url,
        )
        marker_hits = _detect_markers(html=html, title=title, has_rows=bool(page_rows))
        if marker_hits:
            errors.append(
                f"page_diagnostic: blocked or placeholder markers detected on {current_url}: "
                + ", ".join(marker_hits)
            )

        # The same comment is often embedded in several script payloads and
        # repeated on later pages; keep only the first occurrence.
        page_new_rows = _dedupe_rows(page_rows, seen_comment_ids)
        rows.extend(page_new_rows)

        next_candidate = parse_meta.get("next_page_url", "")
        marker_detected = bool(parse_meta.get("pagination_marker_detected"))
        if next_candidate:
            next_candidate = _coerce_next_page_url(
                next_candidate=next_candidate,
                source_post_url=source_post_url,
            )
            next_candidate = _normalize_follow_url(current_url, next_candidate)
            stop_reason, diagnostic = _pagination_rejection(
                current_url=current_url,
                next_candidate=next_candidate,
                source_post_url=source_post_url,
                follow_url_filter=follow_url_filter,
                seen_page_urls=seen_page_urls,
            )
            last_pagination_state["marker_detected"] = True
            last_pagination_state["stop_reason"] = stop_reason
            if stop_reason:
                errors.append(diagnostic)
            else:
                next_page_url = next_candidate
        elif marker_detected and pages_attempted < page_limit:
            last_pagination_state["marker_detected"] = True
            last_pagination_state["stop_reason"] = (
                "pagination marker found but next_page_url missing"
            )
            errors.append(
                f"page_diagnostic: pagination marker detected on {current_url} but next_page_url was not extractable."
            )

        if callable(on_page):
            event = {
                "page_url": current_url,
                "page_index": pages_attempted,
                "page_rows": page_new_rows,
            }
            if debug_capture:
                event["page_screenshot_url"] = screenshot_url
            try:
                on_page(event)
            except Exception as exc:  # a broken callback must not abort the run
                errors.append(
                    f"runtime_diagnostic: on_page callback failed on {current_url}: {exc}"
                )

    if pages_succeeded > 0 and not rows:
        errors.append(
            "parse_diagnostic: page downloads succeeded but no comment rows were extracted."
        )
    errors.extend(_validate_rows(rows))
    if (
        last_pagination_state.get("marker_detected")
        and not next_page_url
        and pages_attempted < page_limit
        and last_pagination_state.get("stop_reason")
    ):
        errors.append(
            "page_diagnostic: pagination traversal stopped early while marker was present "
            f"({last_pagination_state['stop_reason']})."
        )
    return _result()


def _dedupe_rows(page_rows, seen_comment_ids):
    """Return rows whose comment_id is new, recording them in ``seen_comment_ids``."""
    fresh = []
    for row in page_rows:
        comment_id = row.get("comment_id")
        if comment_id in seen_comment_ids:
            continue
        seen_comment_ids.add(comment_id)
        fresh.append(row)
    return fresh


def _pagination_rejection(
    current_url, next_candidate, source_post_url, follow_url_filter, seen_page_urls
):
    """Return ``(stop_reason, diagnostic)`` for a rejected target, else ``("", "")``."""
    valid_target, invalid_reason = _is_valid_pagination_target(
        current_url=current_url,
        next_url=next_candidate,
        source_post_url=source_post_url,
    )
    if not valid_target:
        return (
            invalid_reason,
            f"page_diagnostic: rejected pagination target outside post context on {current_url}: "
            f"{next_candidate} ({invalid_reason})",
        )
    if not _pagination_progresses(current_url, next_candidate):
        return (
            "next_page_url did not advance pagination state",
            f"page_diagnostic: pagination marker on {current_url} yielded a non-progressing next_page_url: {next_candidate}",
        )
    if _is_follow_blocked(next_candidate, follow_url_filter):
        return (
            "follow_url_filter blocked next page",
            f"page_diagnostic: pagination marker on {current_url} blocked by follow_url_filter: {next_candidate}",
        )
    if next_candidate in seen_page_urls:
        return (
            "next page already visited",
            f"page_diagnostic: pagination marker on {current_url} points to an already visited page: {next_candidate}",
        )
    return "", ""


def _resolve_page_limit(max_pages):
    """Clamp ``max_pages`` to ``1..MAX_PAGES``; invalid input yields MAX_PAGES."""
    if max_pages is None:
        return MAX_PAGES
    try:
        value = int(max_pages)
    except (TypeError, ValueError, OverflowError):
        return MAX_PAGES
    if value <= 0:
        return MAX_PAGES
    return min(value, MAX_PAGES)


def _strict_bool_flag(value):
    """Return True only for ``True`` or a recognised truthy string."""
    if value is True:
        return True
    if isinstance(value, str):
        return value.strip().lower() in {"1", "true", "yes", "on"}
    return False


def _normalize_post_url(url):
    """Normalize a post URL: https default, lowercase host, trailing slash.

    Tracking query parameters (hl, igshid, igsh, utm_source, utm_medium),
    blank-valued parameters and the fragment are dropped.
    """
    url = (url or "").strip()
    if not url:
        return ""
    if url.startswith("//"):
        # Protocol-relative URL: prefixing "https://" would yield "https:////host".
        url = "https:" + url
    parsed = urlparse(url)
    if not parsed.scheme:
        parsed = urlparse("https://" + url)
    scheme = parsed.scheme or "https"
    netloc = parsed.netloc.lower()
    path = parsed.path or "/"
    if not path.endswith("/"):
        path = path + "/"
    tracking_keys = {"hl", "igshid", "igsh", "utm_source", "utm_medium"}
    query_pairs = [
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=False)
        if key not in tracking_keys
    ]
    return urlunparse((scheme, netloc, path, "", urlencode(query_pairs), ""))


def _normalize_follow_url(current_url, candidate_url):
    """Resolve ``candidate_url`` against ``current_url`` and drop the fragment."""
    if not candidate_url:
        return ""
    resolved = urljoin(current_url, candidate_url)
    parsed = urlparse(resolved)
    query_pairs = parse_qsl(parsed.query, keep_blank_values=False)
    return urlunparse(
        (
            parsed.scheme or "https",
            parsed.netloc,
            parsed.path,
            "",
            urlencode(query_pairs),
            "",
        )
    )


def _is_follow_blocked(url, follow_url_filter):
    """Return True when ``follow_url_filter`` vetoes ``url``.

    The filter may be a callable (truthy result allows), a substring that must
    occur in the URL, or an object with a ``search`` method (e.g. a compiled
    regex). A filter that raises blocks the URL.
    """
    if follow_url_filter is None:
        return False
    if callable(follow_url_filter):
        try:
            return not bool(follow_url_filter(url))
        except Exception:  # user-supplied callback: fail closed
            return True
    if isinstance(follow_url_filter, str):
        return follow_url_filter not in url
    search = getattr(follow_url_filter, "search", None)
    if callable(search):
        try:
            return search(url) is None
        except Exception:  # user-supplied matcher: fail closed
            return True
    return False


def _detect_markers(html, title, has_rows=False):
    """Return the block/placeholder markers found in the page title and text.

    A marker is only reported when the page has no comment content or looks
    like a short placeholder page; "new tab" requires both conditions.
    """
    visible_text = _extract_visible_text(html)
    text = _clean_whitespace(f"{title} {visible_text}").lower()
    has_content = has_rows or _has_content_markers(html=html, visible_text=visible_text)
    looks_placeholder = _looks_like_placeholder_page(title=title, visible_text=visible_text)
    suspicious = not has_content or looks_placeholder

    markers = []
    for marker in _HARD_BLOCKED_MARKERS:
        if marker in text and marker not in markers and suspicious:
            markers.append(marker)
    for marker in _SOFT_BLOCKED_MARKERS:
        if marker not in text or marker in markers:
            continue
        if marker == "new tab":
            # "new tab" is common in ordinary link text, so demand both signals.
            if looks_placeholder and not has_content:
                markers.append(marker)
            continue
        if suspicious:
            markers.append(marker)
    return markers


def _has_content_markers(html, visible_text):
    """Return True when the page shows signs of real comment content."""
    html_lc = (html or "").lower()
    text_lc = (visible_text or "").lower()
    html_markers = (
        "edge_media_to_parent_comment",
        "edge_media_to_comment",
        "xdt_shortcode_media",
        '"comment_count"',
        "/c/",
        "listing-card",
        "data-listing-id",
        "search-results",
        "results-list",
    )
    if any(marker in html_lc for marker in html_markers):
        return True
    if re.search(r"\b\d+\s+comments?\b", text_lc):
        return True
    if re.search(r"@[a-z0-9._]{1,30}", text_lc) and re.search(
        r"\b(comment|reply|replies)\b", text_lc
    ):
        return True
    return False


def _looks_like_placeholder_page(title, visible_text):
    """Return True for short pages whose title or text is a block message."""
    title_lc = _clean_whitespace(title).lower()
    text_lc = _clean_whitespace(visible_text).lower()
    if title_lc in {"new tab", "access denied", "robot check"} and len(text_lc) < 220:
        return True
    if len(text_lc) < 160 and any(marker in text_lc for marker in _PAGE_BLOCKED_MARKERS):
        return True
    return False


def _extract_rows_from_page(html, page_url, output_post_url, pagination_base_url):
    """Extract comment rows and pagination metadata from one page.

    Rows come from embedded JSON; DOM scraping is used only when JSON yields
    nothing. Returns ``(rows, meta)`` where ``meta`` has ``next_page_url`` and
    ``pagination_marker_detected``.
    """
    post_url = _normalize_post_url(output_post_url or page_url)
    pagination_url = _normalize_post_url(pagination_base_url or page_url)
    json_roots = _collect_json_roots(html)

    rows = []
    for root in json_roots:
        rows.extend(_extract_rows_from_json_root(root, post_url=post_url))
    if not rows:
        rows.extend(_extract_rows_from_dom(html=html, post_url=post_url))

    next_page_url, pagination_marker = _extract_next_page_url(
        html=html,
        current_url=pagination_url,
        json_roots=json_roots,
    )
    return rows, {
        "next_page_url": next_page_url,
        "pagination_marker_detected": pagination_marker,
    }


def _collect_json_roots(html):
    """Return the distinct JSON documents embedded in the page's script tags."""
    roots = []
    serialized_seen = set()
    for match in _SCRIPT_TAG_RE.finditer(html):
        attrs = match.group("attrs") or ""
        body = (match.group("body") or "").strip()
        if not body:
            continue
        for root in _json_candidates_from_script(attrs, body):
            marker = _json_fingerprint(root)
            if marker in serialized_seen:
                continue
            serialized_seen.add(marker)
            roots.append(root)
    return roots


def _json_candidates_from_script(attrs, body):
    """Return every JSON value that can be recovered from one script element."""
    candidates = []
    lower_attrs = attrs.lower()
    lower_body = body.lower()

    # Pure JSON script: declared as such or starting with a JSON container.
    if "application/json" in lower_attrs or body[:1] in "[{":
        parsed = _safe_json_loads(body)
        if parsed is not None:
            candidates.append(parsed)

    # Assignments such as `window._sharedData = {...};`.
    for marker in (
        "window._sharedData",
        "window.__additionalDataLoaded",
        "window.__initialDataLoaded",
        "window.__initialData",
    ):
        extracted = _extract_json_after_marker(body, marker)
        if extracted is not None:
            parsed = _safe_json_loads(extracted)
            if parsed is not None:
                candidates.append(parsed)

    # JSON.parse("...") calls carrying an escaped JSON document.
    for m in _JSON_PARSE_RE.finditer(body):
        decoded = _decode_js_string_literal(m.group(1))
        if decoded is None:
            continue
        parsed = _safe_json_loads(decoded)
        if parsed is not None:
            candidates.append(parsed)

    # Scripts that mention pagination fields: try the first balanced object.
    if "has_next_page" in lower_body and "end_cursor" in lower_body:
        snippet = _extract_balanced_from_token(body, "{", "}")
        parsed = _safe_json_loads(snippet) if snippet else None
        if parsed is not None:
            candidates.append(parsed)
    return candidates


def _decode_js_string_literal(escaped):
    """Decode the inside of a double-quoted JS string literal, or return None.

    JSON string decoding is tried first because it keeps non-ASCII characters
    intact and joins surrogate pairs. Literals using JS-only escapes (``\\x41``,
    ``\\'``) fall back to ``unicode_escape``, with non-Latin-1 characters turned
    into escapes first so they survive that codec's Latin-1 interpretation.
    """
    try:
        decoded = json.loads(f'"{escaped}"')
    except ValueError:
        decoded = None
    if isinstance(decoded, str):
        return decoded
    try:
        return escaped.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except ValueError:  # includes UnicodeDecodeError
        return None


def _extract_json_after_marker(text, marker):
    """Return the first balanced ``{...}``/``[...]`` after ``marker``, or None."""
    idx = text.find(marker)
    if idx == -1:
        return None
    brace_idx = text.find("{", idx)
    bracket_idx = text.find("[", idx)
    if brace_idx != -1 and (bracket_idx == -1 or brace_idx < bracket_idx):
        return _extract_balanced_segment(text, brace_idx, "{", "}")
    if bracket_idx != -1:
        return _extract_balanced_segment(text, bracket_idx, "[", "]")
    return None


def _extract_balanced_from_token(text, open_ch, close_ch):
    """Return the balanced segment starting at the first ``open_ch``, or None."""
    start_idx = text.find(open_ch)
    if start_idx == -1:
        return None
    return _extract_balanced_segment(text, start_idx, open_ch, close_ch)


def _extract_balanced_segment(text, start_idx, open_ch, close_ch):
    """Return ``text`` from ``start_idx`` through the matching ``close_ch``.

    Delimiters inside double-quoted strings (with backslash escapes) are
    ignored. Returns None when the segment never closes.
    """
    depth = 0
    in_string = False
    escape = False
    for i in range(start_idx, len(text)):
        ch = text[i]
        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
            continue
        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return text[start_idx : i + 1]
    return None


def _safe_json_loads(raw):
    """Parse ``raw`` as JSON (tolerating one trailing ``;``); None on failure."""
    if not isinstance(raw, str):
        return None
    candidate = raw.strip()
    if not candidate:
        return None
    if candidate.endswith(";"):
        candidate = candidate[:-1]
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        return None


def _json_fingerprint(value):
    """Return a canonical serialization used to spot duplicate JSON roots."""
    try:
        return json.dumps(value, sort_keys=True, separators=(",", ":"))
    except (TypeError, ValueError, RecursionError):
        return str(type(value))


def _extract_rows_from_json_root(root, post_url):
    """Walk a JSON document and return a row for every comment-like dict."""
    rows = []
    stack = [(root, "")]
    while stack:
        current, path = stack.pop()
        if isinstance(current, dict):
            row = _row_from_comment_node(node=current, path=path, post_url=post_url)
            if row:
                rows.append(row)
            for key, value in current.items():
                if isinstance(value, (dict, list)):
                    child_path = f"{path}.{key}" if path else str(key)
                    stack.append((value, child_path))
        elif isinstance(current, list):
            for idx, value in enumerate(current):
                if isinstance(value, (dict, list)):
                    child_path = f"{path}[{idx}]" if path else f"[{idx}]"
                    stack.append((value, child_path))
    return rows


def _row_from_comment_node(node, path, post_url):
    """Build a row from ``node`` when it looks like a comment, else None.

    A node qualifies when its JSON path mentions "comment"/"thread" or it has
    a comment-specific key, and an id, text and author can all be extracted.
    """
    path_lc = path.lower()
    comment_identity_keys = (
        "is_ranked_comment",
        "did_report_as_spam",
        "comment_like_count",
        "child_comment_count",
        "parent_comment_id",
    )
    commentish = (
        "comment" in path_lc
        or "thread" in path_lc
        or any(k in node for k in comment_identity_keys)
    )
    if not commentish:
        return None
    comment_id = _extract_comment_id(node=node, path_lc=path_lc)
    if not comment_id:
        return None
    comment_text = _extract_comment_text(node)
    if not comment_text:
        return None
    author_username = _extract_author_username(node)
    if not author_username:
        return None
    return {
        "comment_id": comment_id,
        "post_url": post_url,
        "author_username": author_username,
        "comment_text": comment_text,
        "published_at": _extract_published_at(node),
    }


def _extract_comment_id(node, path_lc=""):
    """Return the comment id from explicit keys, ``id`` or a ``/c/<id>`` link."""
    for key in ("comment_id", "comment_pk", "pk"):
        if key in node:
            val = _clean_whitespace(str(node.get(key, "")))
            if _looks_like_comment_id(val):
                return val
    # A bare "id" is only trusted inside a comment/thread subtree.
    raw_id = _clean_whitespace(str(node.get("id", "")))
    if raw_id and ("comment" in path_lc or "thread" in path_lc):
        if _looks_like_comment_id(raw_id):
            return raw_id
    permalink = node.get("permalink") or node.get("url")
    if isinstance(permalink, str):
        match = re.search(r"/c/(\d+)", permalink)
        if match:
            return match.group(1)
    return ""


def _looks_like_comment_id(value):
    """Return True for 5+ digits or a 12+ character URL-safe token."""
    if re.fullmatch(r"\d{5,}", value):
        return True
    if re.fullmatch(r"[A-Za-z0-9_\-]{12,}", value):
        return True
    return False


def _usable_comment_text(value):
    """Return cleaned text, or "" when it is empty, noise or a placeholder."""
    cleaned = _clean_whitespace(value)
    if cleaned and not _is_noise_text(cleaned) and not _has_placeholder_marker(cleaned):
        return cleaned
    return ""


def _extract_comment_text(node):
    """Return the comment body from direct or nested text fields, else ""."""
    for key in ("text", "comment_text", "body"):
        value = node.get(key)
        if isinstance(value, str):
            cleaned = _usable_comment_text(value)
            if cleaned:
                return cleaned
    for key in ("content", "comment"):
        val = node.get(key)
        if isinstance(val, dict):
            for child_key in ("text", "body"):
                child_value = val.get(child_key)
                if isinstance(child_value, str):
                    cleaned = _usable_comment_text(child_value)
                    if cleaned:
                        return cleaned
    return ""


def _extract_author_username(node):
    """Return the author's username from a nested user dict or ``username``."""
    for key in ("author", "owner", "user"):
        value = node.get(key)
        if isinstance(value, dict):
            username = value.get("username")
            if isinstance(username, str):
                normalized = _normalize_username(username)
                if normalized:
                    return normalized
    username = node.get("username")
    if isinstance(username, str):
        normalized = _normalize_username(username)
        if normalized:
            return normalized
    return ""


def _extract_published_at(node):
    """Return the first normalizable timestamp on ``node`` or its time/meta dicts."""
    for key in (
        "created_at",
        "created_time",
        "createdAt",
        "timestamp",
        "taken_at",
        "date",
        "published_at",
    ):
        if key in node:
            normalized = _normalize_datetime(node.get(key))
            if normalized:
                return normalized
    for key in ("time", "meta"):
        value = node.get(key)
        if isinstance(value, dict):
            nested = _extract_published_at(value)
            if nested:
                return nested
    return ""


def _extract_rows_from_dom(html, post_url):
    """Scrape rows from markup around each ``/c/<id>`` comment permalink."""
    rows = []
    for match in _DOM_COMMENT_LINK_RE.finditer(html):
        cid = match.group("cid")
        # Inspect a window of markup around the permalink for the other fields.
        start = max(0, match.start() - 2500)
        end = min(len(html), match.end() + 1200)
        window = html[start:end]
        username = _extract_username_from_window(window)
        comment_text = _extract_comment_text_from_window(window, username)
        published_at = _extract_datetime_from_window(window)
        if not cid or not username or not comment_text:
            continue
        rows.append(
            {
                "comment_id": cid,
                "post_url": post_url,
                "author_username": username,
                "comment_text": comment_text,
                "published_at": published_at,
            }
        )
    return rows


def _extract_username_from_window(window):
    """Return the last profile-link username in ``window``, skipping site paths."""
    reserved = {"explore", "accounts", "reels", "direct"}
    for candidate in reversed(list(_PROFILE_HREF_RE.finditer(window))):
        username = _normalize_username(candidate.group("username"))
        if username and username.lower() not in reserved:
            return username
    return ""


def _extract_comment_text_from_window(window, username):
    """Return the first span text that is not the username, noise or a placeholder."""
    for match in _SPAN_RE.finditer(window):
        text = _clean_html_fragment(match.group(1))
        if not text:
            continue
        if username and text.lower() == username.lower():
            continue
        if _is_noise_text(text) or _has_placeholder_marker(text):
            continue
        return text
    return ""


def _extract_datetime_from_window(window):
    """Return the normalized ``datetime`` attribute of the first time element."""
    match = _TIME_DATETIME_RE.search(window)
    if not match:
        return ""
    return _normalize_datetime(match.group(1))


def _clean_html_fragment(fragment):
    """Strip tags, decode entities and collapse whitespace in ``fragment``."""
    if not fragment:
        return ""
    without_tags = _HTML_TAG_RE.sub(" ", fragment)
    return _clean_whitespace(unescape(without_tags))


def _is_noise_text(text):
    """Return True for UI chrome (buttons, counters) rather than comment text."""
    token = text.strip().lower()
    if not token:
        return True
    if len(token) <= 2:
        return True
    if re.fullmatch(r"view all \d+ comments?", token):
        return True
    if re.fullmatch(r"\d+ comments?", token):
        return True
    if re.fullmatch(r"\d+ replies?", token):
        return True
    noisy = {
        "like",
        "likes",
        "reply",
        "replies",
        "see translation",
        "view replies",
        "more",
        "follow",
        "following",
        "verified",
        "share",
        "send",
        "log in",
        "sign up",
    }
    return token in noisy


def _has_placeholder_marker(value):
    """Return True when ``value`` contains a field placeholder marker."""
    token = _clean_whitespace(value).lower()
    if not token:
        return False
    return any(marker in token for marker in _FIELD_PLACEHOLDER_MARKERS)


def _normalize_username(value):
    """Return the username without a leading "@", or "" when it is invalid."""
    value = _clean_whitespace(value)
    if not value:
        return ""
    if value.startswith("@"):
        value = value[1:]
    return value if _USERNAME_RE.fullmatch(value) else ""


def _normalize_datetime(value):
    """Normalize a timestamp to a string; "" when nothing usable is given.

    Numbers are epoch seconds (or milliseconds above 1e12) rendered as
    ``YYYY-MM-DDTHH:MM:SSZ``. Digit-only strings of 10-13 characters are
    treated the same way. Other strings are returned whitespace-cleaned, with
    a date/time separator space replaced by "T".
    """
    if value is None or isinstance(value, bool):
        # bool is an int subclass; True must not become 1970-01-01T00:00:01Z.
        return ""
    if isinstance(value, (int, float)):
        ts = float(value)
        if ts > 1_000_000_000_000:
            ts = ts / 1000.0
        if ts <= 0:
            return ""
        try:
            dt = datetime.fromtimestamp(ts, tz=timezone.utc)
        except (OverflowError, OSError, ValueError):
            return ""
        return dt.replace(microsecond=0).isoformat().replace("+00:00", "Z")
    value = _clean_whitespace(str(value))
    if not value:
        return ""
    if re.fullmatch(r"\d{10,13}", value):
        return _normalize_datetime(int(value))
    if value.endswith("Z"):
        return value
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}([ T]\d{2}:\d{2}(:\d{2})?)?", value):
        return value.replace(" ", "T")
    return value


def _extract_next_page_url(html, current_url, json_roots):
    """Return ``(next_page_url, marker_detected)`` for the page.

    Sources are tried in order: a rel="next" link, pagination fields inside
    the JSON roots, then a raw cursor found in the HTML text.
    """
    next_link_match = _NEXT_LINK_RE.search(html)
    if next_link_match:
        return next_link_match.group(1), True

    marker_detected = False
    for root in json_roots:
        candidate, marker = _next_from_json_root(root, current_url)
        marker_detected = marker_detected or marker
        if candidate:
            return candidate, True

    text_lc = html.lower()
    if '"has_next_page":true' in text_lc or "has_next_page\\\":true" in text_lc:
        marker_detected = True
        cursor = _extract_cursor_from_html(html)
        if cursor:
            return _with_query(current_url, {"cursor": cursor}), True
    return "", marker_detected


def _next_from_json_root(root, current_url):
    """Search a JSON document for pagination info.

    Returns ``(next_url, marker_detected)``; ``next_url`` is "" when a
    has_next_page marker exists without a usable cursor.
    """
    marker_detected = False
    stack = [root]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            page_info = current.get("page_info")
            if isinstance(page_info, dict):
                has_next = bool(page_info.get("has_next_page"))
                marker_detected = marker_detected or has_next
                if has_next:
                    cursor = page_info.get("end_cursor") or page_info.get("next_cursor")
                    if isinstance(cursor, str) and cursor:
                        return _with_query(current_url, {"cursor": cursor}), True
            paging = current.get("paging")
            if isinstance(paging, dict):
                next_url = paging.get("next")
                if isinstance(next_url, str) and next_url:
                    return next_url, True
            next_max_id = current.get("next_max_id")
            if isinstance(next_max_id, str) and next_max_id:
                return _with_query(current_url, {"max_id": next_max_id}), True
            explicit_next = current.get("next_page_url")
            if isinstance(explicit_next, str) and explicit_next:
                return explicit_next, True
            stack.extend(v for v in current.values() if isinstance(v, (dict, list)))
        elif isinstance(current, list):
            stack.extend(v for v in current if isinstance(v, (dict, list)))
    return "", marker_detected


def _extract_cursor_from_html(html):
    """Return the first cursor value found in raw or backslash-escaped JSON."""
    patterns = (
        r'"end_cursor"\s*:\s*"([^"]+)"',
        r"\\\"end_cursor\\\"\s*:\s*\\\"([^\\]+)\\\"",
        r'"next_max_id"\s*:\s*"([^"]+)"',
    )
    for pattern in patterns:
        match = re.search(pattern, html)
        if match:
            return match.group(1)
    return ""


def _with_query(url, params):
    """Return ``url`` with ``params`` merged into its query; fragment dropped."""
    parsed = urlparse(url)
    existing = dict(parse_qsl(parsed.query, keep_blank_values=False))
    existing.update({k: v for k, v in params.items() if v is not None and v != ""})
    return urlunparse(
        (parsed.scheme or "https", parsed.netloc, parsed.path, "", urlencode(existing), "")
    )


def _extract_visible_text(html):
    """Return page text with scripts, styles, comments and tags removed."""
    if not html:
        return ""
    text = _SCRIPT_TAG_RE.sub(" ", html)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    text = _HTML_COMMENT_RE.sub(" ", text)
    text = _HTML_TAG_RE.sub(" ", text)
    return _clean_whitespace(unescape(text))


def _resolve_post_url_from_html(html, fallback_post_url):
    """Return the page's canonical/og:url Instagram URL, else the fallback."""
    if not html:
        return _normalize_post_url(fallback_post_url)
    candidates = []
    canonical_match = _CANONICAL_LINK_RE.search(html)
    if canonical_match:
        candidates.append(canonical_match.group(1))
    og_match = _OG_URL_RE.search(html)
    if og_match:
        candidates.append(og_match.group(1))
    for candidate in candidates:
        normalized = _normalize_post_url(candidate)
        if _is_instagram_host(urlparse(normalized).netloc):
            return normalized
    return _normalize_post_url(fallback_post_url)


def _coerce_next_page_url(next_candidate, source_post_url):
    """Re-anchor a cross-host pagination URL onto the source post URL.

    When the candidate lives on another host but carries a ``cursor`` or
    ``max_id`` parameter, that token is applied to ``source_post_url``;
    otherwise the candidate is returned unchanged.
    """
    if not next_candidate:
        return ""
    source_parsed = urlparse(source_post_url)
    candidate_parsed = urlparse(next_candidate)
    if not _is_instagram_host(source_parsed.netloc):
        return next_candidate
    if not candidate_parsed.netloc or candidate_parsed.netloc == source_parsed.netloc:
        return next_candidate
    candidate_query = dict(parse_qsl(candidate_parsed.query, keep_blank_values=False))
    for token_key in ("cursor", "max_id"):
        token = candidate_query.get(token_key)
        if token:
            return _with_query(source_post_url, {token_key: token})
    return next_candidate


def _is_instagram_host(netloc):
    """Return True for instagram.com and any of its subdomains."""
    host = (netloc or "").lower()
    return host == "instagram.com" or host.endswith(".instagram.com")


def _is_valid_pagination_target(current_url, next_url, source_post_url):
    """Check that ``next_url`` stays on the same Instagram post.

    Returns ``(True, "")`` or ``(False, reason)``.
    """
    candidate = urlparse(next_url)
    current = urlparse(current_url)
    source = urlparse(source_post_url)
    if not _is_instagram_host(candidate.netloc):
        return False, "next page host is not instagram"
    if not _same_instagram_host_family(candidate.netloc, source.netloc):
        return False, "next page host does not match source host family"
    expected_path = _normalized_path(source.path)
    candidate_path = _normalized_path(candidate.path)
    if not _POST_PATH_RE.fullmatch(candidate_path):
        return False, "next page path is not a post/reel/tv URL"
    if expected_path and candidate_path != expected_path:
        return False, "next page path changed outside the source post"
    if current.path and candidate_path != _normalized_path(current.path):
        return False, "next page path changed from current post path"
    return True, ""


def _same_instagram_host_family(host_a, host_b):
    """Return True when both hosts are equal after dropping a "www." prefix."""
    return _instagram_host_family(host_a) == _instagram_host_family(host_b)


def _instagram_host_family(host):
    """Return the lowercase host without a leading "www."."""
    host = (host or "").strip().lower()
    if host.startswith("www."):
        host = host[4:]
    return host


def _normalized_path(path):
    """Return ``path`` with a guaranteed trailing slash ("/" when empty)."""
    value = path or "/"
    if not value.endswith("/"):
        value = value + "/"
    return value


def _pagination_progresses(current_url, next_url):
    """Return True when ``next_url`` differs meaningfully from ``current_url``.

    Scheme/host/path, the pagination tokens and the remaining query parameters
    (signed-URL parameters excluded) are compared in turn.
    """
    current = urlparse(current_url)
    candidate = urlparse(next_url)
    if _normalized_page_identity(current) != _normalized_page_identity(candidate):
        return True
    if _pagination_tokens(current) != _pagination_tokens(candidate):
        return True
    return _comparison_query_params(current) != _comparison_query_params(candidate)


def _normalized_page_identity(parsed):
    """Return ``(scheme, host, path)`` normalized for page comparison."""
    path = parsed.path or "/"
    if not path.endswith("/"):
        path = path + "/"
    host = parsed.netloc.lower()
    if _is_instagram_host(host):
        host = _instagram_host_family(host)
    return ((parsed.scheme or "https").lower(), host, path)


def _pagination_tokens(parsed):
    """Return the pagination-related query values of a parsed URL."""
    qs = dict(parse_qsl(parsed.query, keep_blank_values=False))
    return (
        qs.get("cursor", ""),
        qs.get("max_id", ""),
        qs.get("end_cursor", ""),
        qs.get("next_max_id", ""),
    )


def _comparison_query_params(parsed):
    """Return sorted query pairs, ignoring per-request signing parameters."""
    ignored = {
        "x-amz-algorithm",
        "x-amz-credential",
        "x-amz-date",
        "x-amz-expires",
        "x-amz-signedheaders",
        "x-amz-signature",
        "x-amz-security-token",
    }
    pairs = [
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=False)
        if key.lower() not in ignored
    ]
    return tuple(sorted(pairs))


def _clean_whitespace(value):
    """Return ``str(value)`` with whitespace runs collapsed; "" for None."""
    if value is None:
        return ""
    value = str(value).replace("\u00a0", " ")
    value = re.sub(r"\s+", " ", value)
    return value.strip()


def _validate_rows(rows):
    """Return field-level diagnostics for the extracted rows.

    Reports empty required fields, placeholder text in username/comment
    fields, and ``published_at`` coverage below 70%.
    """
    diagnostics = []
    if not rows:
        return diagnostics
    total = len(rows)

    for field in ("comment_id", "post_url", "author_username", "comment_text"):
        empty_count = sum(1 for row in rows if not _clean_whitespace(row.get(field, "")))
        if empty_count > 0:
            diagnostics.append(
                f"field_diagnostic: required field '{field}' empty in {empty_count}/{total} rows."
            )

    for field in ("author_username", "comment_text"):
        noisy_count = 0
        for row in rows:
            val = _clean_whitespace(row.get(field, "")).lower()
            if val and any(marker in val for marker in _FIELD_PLACEHOLDER_MARKERS):
                noisy_count += 1
        if noisy_count > 0:
            diagnostics.append(
                f"field_diagnostic: placeholder/noisy values found in '{field}' for {noisy_count}/{total} rows."
            )

    published_non_empty = sum(
        1 for row in rows if _clean_whitespace(row.get("published_at", ""))
    )
    if (published_non_empty / float(total)) < 0.70:
        diagnostics.append(
            "field_diagnostic: published_at non-empty ratio below 0.70; timestamp coverage is low."
        )
    return diagnostics