import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs and the enumerable parameters used to build listing URLs
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a URL with cloudscraper, validate the resulting page, and optionally preprocess the HTML or switch parsers
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
if host_url not in url.lower():
logging.error(f'wrong url format: {url}')
return None
response = scraper.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP error status codes
            # Preprocess the HTML if a preprocessor was supplied
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
return soup
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after {max_retries} retries: {url}')
    return None  # still failing after the maximum number of retries
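# fetch_page expects `validator` to be a callable that takes the parsed soup and returns a bool.
# Below is a minimal sketch of the generic validator that __main__ binds via functools.partial;
# it only checks that the expected element is present, so the real checks may well be stricter.
def generic_validator(soup, tag, identifier, attr_type="id"):
    # e.g. generic_validator(soup, tag="div", identifier="astro", attr_type="id")
    return soup.find(tag, attrs={attr_type: identifier}) is not None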
# Repair the HTML structure by removing stray </br> tags and normalizing <br> tags; needed when scraping the ethnicity pages
def preprocess_html(html):
    return html.replace('</br>', '').replace('<br>', '<br/>')

# Parse an astrology listing page: collect one entry per title plus the next-page link, if any.
# The rows live inside the <div id="astro"> element that the validator checks for; the
# "Next" link text used to detect pagination is an assumption about the page layout.
def parse_page_astro(soup, astro):
    list_data = []
    next_url = None
    astro_div = soup.find('div', id='astro')
    if not astro_div:
        return list_data, next_url
    next_a = astro_div.find('a', string=re.compile(r'next', re.I))
    if next_a and next_a.get('href'):
        next_url = host_url + next_a['href']
    for row in astro_div.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''
            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url
# Parse a credits table: both personal appearances and directorial credits
def parse_credits_table(table, distributor_list):
    # Find and drop the thead; only the body rows need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()  # the header row is not needed
    # Only the tbody rows remain
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
href_a = cols[0].find('a')
href = href_a['href'] if href_a else ''
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'href' : href,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
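
# For reference, a minimal illustration of the row shape parse_credits_table expects: each
# body <tr> carries at least six <td> cells (title, year, distributor, notes, rev, formats).
# The sample markup below is made up for illustration and this helper is never called.
def _demo_parse_credits_table():
    sample = """
    <table id="personal">
      <thead><tr><th>Title</th><th>Year</th><th>Distributor</th><th>Notes</th><th>Rev</th><th>Formats</th></tr></thead>
      <tbody>
        <tr>
          <td><a href="/title.rme/example">Example Title</a></td>
          <td>2020</td><td>Vixen</td><td></td><td></td><td>DVD</td>
        </tr>
      </tbody>
    </table>
    """
    table = BeautifulSoup(sample, "html.parser").find("table")
    # Returns ([{'title': 'Example Title', ...}], {'vixen': 1, 'blacked': 0, ...})
    return parse_credits_table(table, ['vixen', 'blacked', 'tushy', 'x-art'])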
# Parse a performer page: bio metadata, credits, and per-distributor statistics
def parse_page_performer(soup):
    # Extracted bio fields
data = {}
    # Map output keys to the bio headings used on the page
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
    # Parse the credit tables, one per role (performer and director)
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
    # Per-distributor counters
    distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
            # accumulate the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
    # Total number of credited movies across roles
    #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
    # Warn if no credit table was found at all
    if len(credits_list) == 0:
        logging.warning("No credit tables found on performer page")
    # Walk each bioheading to collect the metadata fields
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
        # Headings containing "Performer" need special handling: the aliases are listed in a div
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
        # Store the value under our field key
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
    # Attach the aggregate statistics
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
data['credits'] = credits_list
return data
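
# A small sketch of how a parsed performer record could be persisted with the json module
# imported above; the output directory and filename scheme here are assumptions rather than
# part of the original pipeline.
def save_performer_json(data, name, out_dir="output"):
    os.makedirs(out_dir, exist_ok=True)
    safe_name = re.sub(r'[^\w.-]+', '_', name).strip('_') or 'performer'
    path = os.path.join(out_dir, f"{safe_name}.json")
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return path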
# Parse a movie page: basic info, cast, scene breakdowns, and "appears in" entries
def parse_page_movie(soup, href, title):
    # Basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
    # Cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
    # Scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]  # performer names plus any links
                # Keep only the HTML before the first <br> tag (preserving inline tags such as <i>)
                performer_html = str(performer_info)
                split_html = performer_html.split("<br>")
                if len(split_html) > 1:
                    performers_html = split_html[0].strip()  # the part before the <br>
                else:
                    # fall back to the self-closing form; if neither is present this keeps everything
                    performers_html = performer_html.split("<br/>")[0].strip()
                # Parse back to plain text (strip the HTML tags, keep only the text)
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
                # Extract the performer names
scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to pull the `webscene` and `studio` links
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
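
# Sketch of how a single movie page could be fetched and parsed with the helpers above. The
# validator assumption is that movie pages contain the castbox divs parse_page_movie relies on;
# href must be an absolute iafd.com URL, such as those produced by the other parsers.
def fetch_movie(href, title):
    soup = fetch_page(href, partial(generic_validator, tag="div", identifier="castbox", attr_type="class"))
    return parse_page_movie(soup, href, title) if soup else None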
if __name__ == "__main__":
    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")
        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0])
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
        time.sleep(2)  # throttle the request rate between signs