# resources/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils
# Base URL and the parameterised URL templates
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
ethnic_list_url = f'{host_url}/advsearch.asp'
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99
save_raw_html = True
load_from_local = False
def common_parser(html, page, **kwargs):
    parser = "lxml" if page == 'ethnic' else "html.parser"
    soup = BeautifulSoup(html, parser)
    if not soup:
        return None
    if page == 'astro':
        # parse_page_astro(soup, astro)
        return parse_page_astro(soup, **kwargs)
    elif page == 'birth':
        # parse_page_birth(soup, month, day)
        return parse_page_birth(soup, **kwargs)
    elif page == 'ethnic':
        # parse_page_ethnic(soup, ethnic)
        return parse_page_ethnic(soup, **kwargs)
    elif page == 'dist':
        return parse_page_dist_stu(soup, 'distable')
    elif page == 'stu':
        return parse_page_dist_stu(soup, 'studio')
    elif page == 'actor':
        # parse_page_performer(soup, url)
        return parse_page_performer(soup, **kwargs)
    elif page == 'movies':
        # parse_page_movie(soup, href, title)
        return parse_page_movie(soup, **kwargs)
    else:
        logging.warning(f"wrong page: {page}")
        return None
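# Minimal usage sketch for the dispatcher above (illustrative only; the HTML
# string and the 'Aries' keyword argument are examples, and the kwargs must
# match the target parser's signature as hinted in the comments above):
#
#     html = scraper.get(astr_base_url + 'Aries', headers=headers).text
#     result = common_parser(html, 'astro', astro='Aries')
#     # 'astro'/'birth'/'ethnic'/'dist'/'stu' return a (list_data, next_url) tuple,
#     # while 'actor' and 'movies' return a single dict.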
'''
# Fetch a page with CloudScraper, validate it, and optionally preprocess the HTML
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    if load_from_local:  # Try to read the page from the local cache first
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, http_code_local  # A code below 100 marks a local-cache hit
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, http_code_url
            response = scraper.get(url, headers=headers)
            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # Return 404 directly so the caller can skip
            response.raise_for_status()  # Raise on other HTTP errors
            # Treat expired pages the same as a 404
            if "invalid or outdated page" in response.text.lower():
                logging.debug(f"invalid or outdated page: {url}")
                return None, http_code_404  # Return 404 directly so the caller can skip
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, response.status_code
            else:
                # Check whether we were redirected, e.g. to the login page
                if response.history:
                    logging.warning(f"Page redirected on {url}. Validation failed.")
                    return None, http_code_login
                logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries
'''
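# Minimal working substitute for the commented-out fetch_page above. This is a
# sketch, not the original implementation: it drops the local-cache and
# raw-HTML-saving paths (they depend on the commented-out `utils` import) and
# returns only the validated soup (or None), which matches how the __main__
# block at the bottom of this file calls it.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers)
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None
            response.raise_for_status()
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")
        time.sleep(2)  # back off briefly before the next attempt
    logging.error(f'Fetching failed after max retries. {url}')
    return None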
# Repair the HTML structure: drop stray <br> tags and patch <a> tags to open in
# a new tab; needed when parsing the ethnicity pages
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
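# Example of binding the validator with functools.partial before handing it to
# fetch_page (mirrors the call in the __main__ block below). The 'astro' div id
# is the marker used on the astrology pages; other pages would swap in their
# own tag/identifier:
#
#     astro_validator = partial(generic_validator, tag="div", identifier="astro", attr_type="id")
#     soup = fetch_page(astr_base_url + 'Leo', astro_validator)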
# Check whether the movie info table exists
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None
# Parse the advanced-search page and extract the list of ethnicity options
def parse_page_ethnic_list(soup, href):
    div_root = soup.find("select", id="ethnicity1")
    if not div_root:
        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
        return None
    list_data = []
    # Extract all the <option> tags
    options = div_root.find_all('option')
    if options:
        # Parse out the value attribute and the text content
        for option in options:
            href = option.get('value', None)
            text = option.text.strip()
            if href and href.lower() == 'none':
                continue
            list_data.append({
                "name": text,
                "href": host_url + href if href else ''
            })
    return list_data
# Parse an astrology page and extract the performers listed under each birth date
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None
    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None
# Parse a birthday-calendar page and build the list of performers born that day
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None
    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href
        # Skip this entry if the href has already been collected
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue
        # Append the record
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None
# Parse an ethnicity search-result page and extract the performers on it
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None
    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True
            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()
                # Store the record
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")
        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None
# Parse the distributor/studio list page (the <select> dropdown of all entries)
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None
    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # value attribute
            text = option.text.strip()   # text content
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None
# Parse a distributor/studio page and extract its list of titles
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None
    # Find the thead and drop it; it does not need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''
            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url
# Parse a credits table (covers both performer and directorial credits)
def parse_credits_table(table, distributor_list):
    # Find the thead and drop it; it does not need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialise a counter per distributor
    # rows = table.find_all('tr', class_='we')
    for row in rows:
        # tr_class = row.get('class', '')
        tr_class = ' '.join(row.get('class', []))  # class attribute as a string, empty if absent
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            href_a = cols[0].find('a')
            href = href_a['href'] if href_a else ''
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            href_d = cols[2].find('a')
            href_dist = host_url + href_d['href'] if href_d else ''
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()
            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1
            movies.append({
                'title': title,
                'href': href,
                'year': year,
                'distributor': distributor,
                'distributor_href': href_dist,
                'notes': notes,
                'rev': rev,
                'formats': formats,
                'tr_class': tr_class
            })
    return movies, distributor_count
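# Shape of the return value, shown with hypothetical data for a single credit
# row whose distributor column contains "vixen" (all values are illustrative):
#
#     movies, counts = parse_credits_table(table, ['vixen', 'blacked', 'tushy', 'x-art'])
#     # movies -> [{'title': '...', 'href': '...', 'year': '2021', 'distributor': 'vixen',
#     #             'distributor_href': '...', 'notes': '', 'rev': '', 'formats': '', 'tr_class': 'we'}]
#     # counts -> {'vixen': 1, 'blacked': 0, 'tushy': 0, 'x-art': 0}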
# Parse a performer page and extract the required data
def parse_page_performer(soup, url):
    # Extracted data
    data = {}
    # Field names we need and the corresponding headings in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}
    # Parse the credits tables: one for performer credits, one for directorial credits
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}
    # Use a dict to keep the per-distributor totals
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Merge the distributor counts
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)
    # Count the total number of movies
    # movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
    # Warn if no credits table was found at all
    if len(credits_list) == 0:
        logging.warning(f"movie table empty. url: {url} ")
    # Walk every bioheading to collect the metadata fields
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None
        # Headings containing "Performer" need special handling (AKA list)
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
        # Store the value under our field name
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata
    # Attach the statistics to the result
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list
    return data
# Parse a movie page and extract the movie information
def parse_page_movie(soup, href, title):
    # Parse the basic movie metadata
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            if key == "Directors":  # handle the multiple-directors case
                directors = []
                links = value.find_all("a")
                for link in links:
                    director_name = link.text.strip()
                    director_href = host_url + link['href'] if link.get('href') else ''
                    directors.append({"name": director_name, "href": director_href})
                movie_data[key] = directors
            else:
                val = value.text.strip()
                if key in ["Distributor", "Studio", "Director"]:
                    link = value.find("a")
                    if link:
                        val = link.text.strip()
                        movie_data[f'{key}Href'] = host_url + link['href']
                movie_data[key] = val
    else:
        return None
    # Parse the cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]
            # performer["tags"] = [
            #     tag.strip() for br in cast.find_all("br")
            #     if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            # ]
            tags = []
            for br in cast.find_all("br"):
                tag = br.next_sibling
                if isinstance(tag, str) and tag.strip():
                    tags.append(tag.strip())
            performer["tags"] = tags
            # performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
            performers.append(performer)
    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers plus link info
                # Keep the complete HTML before the first <br> (preserving <i> tags etc.)
                performer_html = str(performer_info)
                # BeautifulSoup may serialise the tag as <br/> or <br>, so split on either
                performers_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)[0].strip()
                # Strip the HTML tags and keep only the text content
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()
                # Extract the individual performers
                scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to pick up the `webscene` and `studio` links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }
                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Directors": movie_data.get("Directors", []),  # present only when multiple directors were listed
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }
if __name__ == "__main__":
    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")
        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
        time.sleep(2)  # throttle the request rate