modify scripts

2025-07-24 19:13:56 +08:00
parent cc6530d73a
commit 50d829364b
10 changed files with 1289 additions and 128 deletions

View File

@ -133,8 +133,9 @@ fi
# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
register_spider "pbox" "scrapy crawl javhd -a mod='update' "
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
register_spider "javhd" "scrapy crawl javhd -a mod='update' "
register_spider "lord" "scrapy crawl lord -a mod='update' "
fi

View File

@ -6,6 +6,7 @@ from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
from scrapy_proj.utils.utils import pretty_json_simple
# Registry of spider DB handlers
spider_handler_registry = {}
@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return 0
@register_handler(comm.SPIDER_NAME_LORD)
class LordDBHandler(SQLiteDBHandler):
def __init__(self, db_path=shared_db_path):
super().__init__(db_path)
self.tbl_name_actors = 'thelordofporn_actress'
self.tbl_name_alias = 'thelordofporn_alias'
def insert_item(self, item):
if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
self.insert_actor(item)
else:
logging.error(f"unkown item.")
return item
def insert_actor(self, item):
actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
if actor_id:
for alias in item.get('alias', []):
alias_data = {'actress_id':actor_id, 'alias':alias}
affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id','alias'], exists_do_nothing=False)
if affected_rows:
logging.debug(f"insert/update actress_alias. data: {alias_data}")
else:
logging.warning(f"insert actor alias error!. data: {alias_data}")
else:
logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")
# Statistics helper
def get_stat(self):
try:
self.cursor.execute(f"""
SELECT
(SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
""")
row = self.cursor.fetchone()
if not row:
logging.warning(f"query no results.")
return {}
columns = [desc[0] for desc in self.cursor.description]
return dict(zip(columns, row))
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return {}
def has_full_data(self, href):
try:
self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
row = self.cursor.fetchone()
return row[0] if row else None
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return 0
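A minimal usage sketch of the new handler (values are illustrative only; it assumes the shared SQLite file and the thelordofporn_* tables already exist):
db = LordDBHandler()
db.insert_item({
'item_type': comm.ITEM_TYPE_ACTOR_DETAIL,
'href': 'https://thelordofporn.com/pornstars/example',  # hypothetical URL
'pornstar': 'Example Name',
'alias': ['Alias One', 'Alias Two'],
})
print(db.get_stat())  # e.g. {'actor_cnt': 1}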

View File

@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # applies the singleton metaclass
logging.error(f"Error inserting or updating data: {e}")
return None
def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
"""
针对联合主键表执行插入或更新操作
:param table_name: 表名
:param data: 字典类型,待插入或更新的数据
:param composite_pk: 列表类型,联合主键字段名集合
:param need_update: 布尔值记录存在时是否更新默认True
:return: 操作影响的行数
"""
try:
# Validate the composite key argument
if not isinstance(composite_pk, list) or len(composite_pk) < 2:
logging.error(f"composite_pk must be a list containing at least two fields: {composite_pk}")
return None
processed_data = self.check_and_process_data(data, tbl_name)
# Make sure every composite key field is present in the data
for pk_field in composite_pk:
if pk_field not in processed_data:
logging.error(f"composite key field '{pk_field}' missing from data")
return None
# Build the WHERE clause
where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
pk_values = [processed_data[pk] for pk in composite_pk]
# Check whether the record already exists
self.cursor.execute(
f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
pk_values
)
exists = self.cursor.fetchone() is not None
if exists:
if exists_do_nothing:
return 0
# Build the update field list (excluding the composite key fields)
update_fields = [f for f in processed_data.keys() if f not in composite_pk]
if not update_fields:
return 0
set_clause = ", ".join([f"{field} = ?" for field in update_fields])
update_values = [processed_data[field] for field in update_fields] + pk_values
# Run the update (standard syntax, compatible with older SQLite versions)
update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
self.cursor.execute(update_sql, update_values)
return 1
else:
# Run the insert
columns = ", ".join(processed_data.keys())
placeholders = ", ".join(["?" for _ in processed_data.keys()])
insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
self.cursor.execute(insert_sql, list(processed_data.values()))
return 2
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
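A quick sketch of how the composite-key upsert is intended to be called, using a hypothetical handler instance and table; per the code above it returns 0 when the record already exists and is left untouched, 1 after an update, 2 after an insert, and None on error:
rows = handler.insert_or_update_with_composite_pk(
data={'actress_id': 42, 'alias': 'Example Alias'},
tbl_name='thelordofporn_alias',
composite_pk=['actress_id', 'alias'],
exists_do_nothing=False,
)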

View File

@ -192,4 +192,33 @@ class JavHDActorItem(scrapy.Item):
birth_date = scrapy.Field()
ethnicity = scrapy.Field()
birth_place = scrapy.Field()
is_full_data = scrapy.Field()
is_full_data = scrapy.Field()
class LordActorItem(scrapy.Item):
item_type = scrapy.Field()
pornstar = scrapy.Field()
rating = scrapy.Field()
rank = scrapy.Field()
votes = scrapy.Field()
href = scrapy.Field()
career_start = scrapy.Field()
measurements = scrapy.Field()
born = scrapy.Field()
height = scrapy.Field()
weight = scrapy.Field()
date_modified = scrapy.Field()
global_rank = scrapy.Field()
weekly_rank = scrapy.Field()
last_month_rating = scrapy.Field()
current_rating = scrapy.Field()
total_votes = scrapy.Field()
birth_date = scrapy.Field()
birth_year = scrapy.Field()
birth_place = scrapy.Field()
height_ft = scrapy.Field()
height_cm = scrapy.Field()
weight_lbs = scrapy.Field()
weight_kg = scrapy.Field()
is_full_data = scrapy.Field()
alias = scrapy.Field()

View File

@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
yield request
def parse(self, response):
"""统一的响应处理入口"""
"""统一的响应处理入口,实际上没有起作用,因为直接走了 scrapy.Request 里的 callback """
# 记录请求耗时
request_time = response.meta.get('request_time')
if request_time:

View File

@ -1,15 +1,19 @@
import scrapy
import re
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
from scrapy_proj.spiders.parser.iafd_parser import common_parser
from scrapy_proj.utils.utils import pretty_json_simple
db_tools = IAFDDBHandler()
class IAFDSpider(BaseSpider):
name = SPIDER_NAME_IAFD
allowed_domains = ["iafd.com"]
allowed_domains = ["iafd.com", "www.iafd.com"]
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
studios_list_url = f"{host_url}/studio.asp"
ethnic_list_url = f'{host_url}/advsearch.asp'
def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.update = int(update)
self.update_mode = True if mod and mod.lower() == 'update' else False
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
self.cmd_astro = 'astro'
@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
query_args = {}
if self.debug:
query_args['limit'] = 5
if self.update == 0:
if self.update_mode:
query_args['is_full_data'] = 0
query_args['is_full_data'] = 404
# Load the list of performers awaiting update
if self.cmd_performers in self.cmd_list:
@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type':'actor'})
# Load the list of movies awaiting update
if self.cmd_movies in self.cmd_list:
@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
for item in movies:
href = item.get('href', '')
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type':'movie'})
def start_astro(self):
@ -113,50 +118,28 @@ class IAFDSpider(BaseSpider):
yield request
def parse_astro_page(self, response):
astro = response.meta['astro']
astro_div = response.css('div#astro')
if astro_div:
birth_date = None
for elem in astro_div.css('*'):
if elem.css('h3.astroday'):
birth_date = elem.css('h3.astroday::text').get().strip()
elif elem.css('div.perficon'):
a_tag = elem.css('a')
if a_tag:
href = self.host_url + a_tag.attrib['href']
name = a_tag.css('span.perfname::text').get()
if name:
item = IAFDPersonItem()
item['name'] = name
item['href'] = href
item['from_astro_list'] = 1
item['from_birth_list'] = 0
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
astro = response.meta.get('astro', '')
data, next_url = common_parser(html=response.text, page='astro', astro=astro)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"parse data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_birth_page(self, response):
month = response.meta['month']
day = response.meta['day']
datarows = response.css('div.col-sm-12.col-lg-9')
if datarows:
rows = datarows[0].css('div.col-sm-4')
for row in rows:
link_tag = row.css('a')
person = link_tag.css('::text').get().strip() if link_tag else ''
href = self.host_url + link_tag.attrib['href'] if link_tag else ''
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 1
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"parse data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_ethnic_list_page(self, response):
div_root = response.css('select#ethnicity1')
if div_root:
@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
href = option.attrib.get('value')
text = option.css('::text').get().strip()
if href and href.lower() != 'none':
ethnic_url = self.host_url + href
ethnic_url = urljoin(response.url , href)
self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
if self.debug:
break
def parse_ethnic_page(self, response):
ethnic = response.meta['ethnic']
rows = response.css('div.row.headshotrow')
for row in rows:
cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
for col in cols:
link_tag = col.css('a')
img_tag = col.css('div.pictag')
if link_tag and img_tag:
href = self.host_url + link_tag.attrib['href']
person = img_tag.css('::text').get().strip()
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 0
item['from_ethnic_list'] = 1
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
next_page = response.css('a[rel="next"]')
if next_page:
next_url = self.host_url + next_page.attrib['href']
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
self.logger.warning(f"parse data error. {response.url}")
if next_url:
self.logger.info(f"find next page: {next_url}")
else:
self.logger.info(f"found all pages. url: {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
dis_url = self.host_url + f"/distrib.rme/distrib={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = dis_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 1
item['from_stu_list'] = 0
yield item
#yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
stu_url = self.host_url + f"/studio.rme/studio={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = stu_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 0
item['from_stu_list'] = 1
yield item
#yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
dis_url = f"{self.host_url}/studio.rme/studio={value}"
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
def parse_stu_dist_page(self, response):
list_type = response.meta.get('list_type', '')
data, next_url = common_parser(html=response.text, page=list_type)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_person_detail_page(self, response):
data = common_parser(html=response.text, page='actor', url=response.url)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDPersonDetailItem()
item['href'] = response.url
item['person'] = response.css('h1::text').get() # assumes the name is in the h1 tag
# Parse the remaining details; adjust to the actual page structure
item['gender'] = response.css('span.gender::text').get()
item['birthday'] = response.css('span.birthday::text').get()
item['astrology'] = response.css('span.astrology::text').get()
item['birthplace'] = response.css('span.birthplace::text').get()
item['years_active'] = response.css('span.years_active::text').get()
item['ethnicity'] = response.css('span.ethnicity::text').get()
item['nationality'] = response.css('span.nationality::text').get()
item['hair_colors'] = response.css('span.hair_colors::text').get()
item['eye_color'] = response.css('span.eye_color::text').get()
item['height'] = response.css('span.height::text').get()
item['weight'] = response.css('span.weight::text').get()
item['measurements'] = response.css('span.measurements::text').get()
item['tattoos'] = response.css('span.tattoos::text').get()
item['piercings'] = response.css('span.piercings::text').get()
item['movies_cnt'] = response.css('span.movies_cnt::text').get()
item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
item['performer_aka'] = response.css('span.performer_aka::text').getall()
yield item
#yield item
def parse_movie_detail_page(self, response):
title = response.meta.get('title', '')
data = common_parser(html=response.text, page='movies', href=response.url, title=title)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDMovieDetailItem()
item['title'] = response.css('h1::text').get() # assumes the title is in the h1 tag
item['href'] = response.url
# Parse the remaining details; adjust to the actual page structure
yield item
#yield item
def custom_block_check(self, response):
item_type = response.meta.get('item_type', '')
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
return "invalid or outdated page"
else:
self.logger.info(f"right content. url: {response.url}")
return None
# Handle page errors, mainly 404 and 403
def handle_blocked(self, response, reason):
item_type = response.meta.get('item_type', '')
if response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")

View File

@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
item['rank'] = rank
item['url'] = url
item[f'{lang}_name'] = name
#TODO: for non-English pages, update the corresponding localized name
# For non-English pages, update the corresponding localized name
if lang != 'en':
item['url'] = replace_lang_param(item['url'])
yield item
@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
meta={"list_item": item} # 传递列表页数据到详情页
)
else:
self.logger.info(f"actor(name) has full data. skip. url: {url}")
self.logger.info(f"actor({name}) has full data. skip. url: {url}")
# Fetch the next page
next_path = data.get("pagination_params", {}).get("next")

View File

@ -0,0 +1,399 @@
import scrapy
import sys
import re
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import LordActorItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler
db_tools = LordDBHandler()
class LordSpider(BaseSpider):
name = SPIDER_NAME_LORD
allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]
# Request headers copied from the original curl command
custom_settings = {
"DEFAULT_REQUEST_HEADERS": {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
"priority": "u=0, i",
"sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"macOS\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
},
"COOKIES_ENABLED": True # 启用Cookie支持
}
def __init__(self, debug='false', mod='update', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.update_mod = False if mod and mod.lower() == 'force' else True
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
# Entry point, triggered by the base class
def custom_start_requests(self):
url = 'https://thelordofporn.com/pornstars/'
yield scrapy.Request(
url=url,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # use the GET headers
callback=self.parse_list,
meta={} # no extra meta needed for the list page
)
def parse_list(self, response):
# Extract all actor entries (article.loop-item in the original script)
articles = response.css("article.loop-item")
self.logger.info(f"当前页({response.url})找到 {len(articles)} 个演员条目")
for article in articles:
try:
# Extract the actor name and detail-page link
title_tag = article.css("h3.loop-item__title a")
title = title_tag.css("::text").get(default="N/A").strip()
href = title_tag.attrib.get("href") # href attribute of the a tag
# Extract the rating
rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()
# Extract rank and vote count (meta_tags in the original script)
meta_tags = article.css("div.loop-item__rank span")
rank = None
votes = None
# Parse the rank (b tag inside the first span)
if len(meta_tags) >= 1:
rank_b = meta_tags[0].css("b::text").get()
rank = rank_b.strip() if rank_b else "N/A"
# Parse the vote count (b tag inside the second span)
if len(meta_tags) >= 2:
votes_b = meta_tags[1].css("b::text").get()
votes = votes_b.strip() if votes_b else "N/A"
# Convert to numeric values (mirrors utils.parse_numeric in the original script)
def parse_numeric(value):
if not value or value == "N/A":
return None
# Strip non-digit characters (commas, %, etc.)
numeric_str = ''.join(filter(str.isdigit, value))
return int(numeric_str) if numeric_str else None
# Build the actor data dict
actress_data = {
"pornstar": title,
"rating": parse_numeric(rating),
"rank": parse_numeric(rank),
"votes": parse_numeric(votes),
"href": href if href else None
}
# Issue the detail-page request
actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
if actor_exists < 1 :
yield scrapy.Request(
url=href,
callback=self.parse_actor_detail,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
meta = {'actor':actress_data}
)
else:
self.logger.info(f"actor({title}) has full data. skip. url: {href}")
except Exception as e:
self.logger.error(f"解析演员条目失败: {e}, 页面: {response.url}")
continue # 跳过错误条目,继续解析下一个
# Extract the next-page link (.next.page-numbers in the original script)
next_page_url = None
next_page_tag = response.css(".nav-links .next.page-numbers")
if next_page_tag:
next_page_href = next_page_tag.attrib.get("href")
if next_page_href and not self.debug:
# Build the absolute URL (handles relative paths)
next_page_url = urljoin(response.url, next_page_href)
yield scrapy.Request(
url=next_page_url,
callback=self.parse_list,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
meta = {}
)
else:
self.logger.info(f"已解析所有页面, current url: {response.url}")
def parse_actor_detail(self, response):
# 1. Field mapping: raw page field -> Item field
FIELD_MAPPING = {
# Basic info
'date_modified': 'date_modified',
# Rank info
'Global Rank': 'global_rank',
'Weekly Rank': 'weekly_rank',
# Rating info
'Last Month': 'last_month_rating',
'Rating Av.': 'current_rating',
'Total of Votes': 'total_votes',
# Detailed attributes
'Career start': 'career_start',
'Measurements': 'measurements',
'Born': 'born',
'Height': 'height',
'Weight': 'weight',
'Name': 'alias_raw', # aliases come from the Name field
# Parsed fields (birth / height / weight)
'birth_date': 'birth_date',
'birth_year': 'birth_year',
'birth_place': 'birth_place',
'height_ft': 'height_ft',
'height_cm': 'height_cm',
'weight_lbs': 'weight_lbs',
'weight_kg': 'weight_kg',
'alias':'alias'
}
# 2. Initialize the raw-data container
raw_data = {}
# 3. Extract the basic info
raw_data['href'] = response.url
entry_header = response.css("header.entry-header")
raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()
# 4. Extract the rank info
for item in entry_header.css("div.porn-star-rank__item"):
item_text = item.css("::text").get(default="").strip()
raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
# 5. Extract the rating and vote info
for item in response.css("div.specifications__item--horizontal"):
# Locate the title area precisely (excluding the b tag)
# Case 1: structure with a child div (e.g. Rating Av. with an img)
title_div = item.css("div:first-child")
if title_div:
# Take only the text inside the child div, which excludes the sibling b tag
title_parts = title_div.css("::text").getall()
else:
# Cases 2 and 3: no child div (Last Month and Total of Votes)
# Take all text inside the current item but drop the b-tag content
all_text_parts = item.css("::text").getall()
b_text_parts = item.css("b::text").getall()
# Remove the b-tag text from the collected text
title_parts = [t for t in all_text_parts if t not in b_text_parts]
# Clean the title text (non-breaking spaces and whitespace)
title_text = "".join(title_parts)
title_text = title_text.replace(u'\xa0', u' ') # replace non-breaking spaces
title_text = re.sub(r'\s+', ' ', title_text).strip() # collapse whitespace
raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
# 6. Extract the detailed attributes (specifications-grid-row)
for row in response.css("div.specifications-grid-row"):
items = row.css("div.specifications-grid-item")
for i in [0, 1]: # two attributes per row
if i < len(items):
label = extract_text_from_element(items[i].css("h5"))
value = extract_text_from_element(items[i].css("span"))
if label:
raw_data[label] = value
# 7. Handle special fields (aliases need cleaning)
raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))
# 8. Parse birth info, height and weight, then merge
raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
raw_data.update(self.parse_height(raw_data.get("Height", "")))
raw_data.update(self.parse_weight(raw_data.get("Weight", "")))
# 9. Map into the Item and return
item = LordActorItem()
item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
actor_data = response.meta['actor']
for k, v in actor_data.items():
if k in item.fields:
item[k] = v
for raw_field, item_field in FIELD_MAPPING.items():
if item_field in item.fields:
item[item_field] = raw_data.get(raw_field, "")
# Mark the record as complete
item['is_full_data'] = 1
self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")
yield item
# Helper functions kept from the original script (now Spider methods)
def parse_birth_info(self, text):
match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
if match:
return {
"birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
"birth_year": match.group(3),
"birth_place": match.group(4),
}
return {"birth_date": text, "birth_year": "", "birth_place": ""}
def parse_height2(self, text):
match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
if match:
height_ft = f"{match.group(1)}'{match.group(2)}\""
return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
return {"height_ft": text, "height_cm": ""}
def parse_height(self, text):
# Preprocessing: replace commas with dots and fix common typos (e.g. 'n' -> 'in')
text = text.replace(',', '.').replace(' n ', ' in ').strip()
# Regex matching all feet+inch formats (several notations supported)
# Group reference:
# 1. feet value  2. feet unit (feet/ft/ft./')  3. inch value  4. inch unit (inches/in/in./inch/")
# 5. cm/m value  6. unit (cm/m)
pattern = r"""
# 情况1先英尺英寸后厘米/米(主流格式)
(?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分如5ft/5'
(?:and\s*)? # 可选的"and"如5 feet and 2 inches
(\d+)\s*(inches|in\.?|inch|")?\s* # 英寸部分如2in/2"
(?:\(?(\d+\.?\d*)\s*(cm|m)\)?) # 厘米/米部分(如(157cm)/(1.57m)
| # 或
# 情况2先厘米后英尺英寸如170 cm / 5 feet and 7 inches
(\d+)\s*cm\s*/\s* # 厘米在前
(?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分
(?:and\s*)?
(\d+)\s*(inches|in\.?|inch|")? # 英寸部分
| # 或
# 情况3纯简写格式如5'3" (160 cm)
(\d+)'(\d+)"\s*\(?(\d+)\s*cm\)? # 5'3"格式
"""
# VERBOSE ignores whitespace inside the pattern; IGNORECASE makes matching case-insensitive
match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
if not match:
# Handle a plain centimetre value (e.g. "160cm")
cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
if cm_match:
return {"height_ft": "", "height_cm": cm_match.group(1)}
return {"height_ft": text, "height_cm": ""}
# Extract the match results (group handling differs per case)
ft = None
inch = None
cm = None
# Case 1: feet/inches first, then cm/m
if match.group(1) and match.group(3):
ft = match.group(1)
inch = match.group(3)
num = match.group(5)
unit = match.group(6).lower() if match.group(6) else 'cm'
# Case 2: cm first, then feet/inches
elif match.group(7):
cm = match.group(7)
ft = match.group(8)
inch = match.group(10)
unit = 'cm' # in case 2 the leading unit is always cm
# Case 3: shorthand format (5'3")
elif match.group(12) and match.group(13): # groups 12-14 belong to the third regex alternative
ft = match.group(12)
inch = match.group(13)
cm = match.group(14)
unit = 'cm'
# Convert metres to centimetres when needed
if not cm and num and unit:
if unit == 'm':
cm = str(int(float(num) * 100)) # 1.57m -> 157cm
else:
cm = num # already in centimetres
# Format the feet/inch expression, e.g. 5'2"
height_ft = f"{ft}'{inch}\"" if ft and inch else ""
return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}
def parse_weight2(self, text):
match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
if match:
return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
return {"weight_lbs": text, "weight_kg": ""}
def parse_weight(self, text):
# Preprocessing: trim and normalise spaces
text = text.strip().replace(' ', ' ')
# Regex matching several weight formats
# Group reference:
# 1. pound value  2. pound unit (lb/lbs/pounds)  3. kilogram value  4. kilogram unit (kg)
# 5. leading kilogram value  6. kilogram unit  7. trailing pound value  8. pound unit
pattern = r"""
# 情况1磅在前千克在后主流格式
(?:(\d+)\s*(lb|lbs|pounds)?\s*) # 磅部分支持lb/lbs/pounds或省略单位
(?:\(?\s*(\d+)\s*(kg)\s*\)?) # 千克部分(如(45 kg)
| # 或
# 情况2千克在前磅在后如52 kg / 114 lbs
(?:(\d+)\s*(kg)\s*/\s*) # 千克部分
(\d+)\s*(lb|lbs|pounds)? # 磅部分
"""
# VERBOSE and IGNORECASE improve tolerance of formatting differences
match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
if not match:
# Try a plain kilogram value (e.g. "52kg")
kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
if kg_match:
return {"weight_lbs": "", "weight_kg": kg_match.group(1)}
# Try a plain pound value (e.g. "114lb")
lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
if lb_match:
return {"weight_lbs": lb_match.group(1), "weight_kg": ""}
# Nothing could be parsed
return {"weight_lbs": text, "weight_kg": ""}
# Extract the match results (group handling differs per case)
weight_lbs = None
weight_kg = None
# Case 1: pounds first, kilograms after
if match.group(1) and match.group(3):
weight_lbs = match.group(1)
weight_kg = match.group(3)
# Case 2: kilograms first, pounds after
elif match.group(5) and match.group(6):
weight_kg = match.group(5)
weight_lbs = match.group(7)
return {
"weight_lbs": weight_lbs.strip() if weight_lbs else "",
"weight_kg": weight_kg.strip() if weight_kg else ""
}
def clean_alias(self, alias):
alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
return [name.strip() for name in alias.split(',') if name.strip()]
def parse_numeric(self, value):
try:
return float(value)
except (ValueError, TypeError):
return 0
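A few illustrative calls against the helpers above, assuming `spider` is a LordSpider instance; the sample strings are made up, not taken from the site:
spider.parse_height("5 ft 2 in (157 cm)")            # height_ft: 5'2", height_cm: 157
spider.parse_height("170 cm / 5 feet and 7 inches")  # height_ft: 5'7", height_cm: 170
spider.parse_weight("114 lbs (52 kg)")                # weight_lbs: 114, weight_kg: 52
spider.clean_alias("Jane Doe (Age 25), J. Doe")       # ['Jane Doe', 'J. Doe']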

View File

@ -0,0 +1,636 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils
# Base URLs and variable parameters
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
ethnic_list_url = f'{host_url}/advsearch.asp'
# Set up headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99
save_raw_html = True
load_from_local = False
def common_parser(html, page, **kwargs):
parser = "lxml" if page=='ethnic' else "html.parser"
soup = BeautifulSoup(html, parser)
if not soup:
return None
if page == 'astro':
#parse_page_astro(soup, astro):
return parse_page_astro(soup, **kwargs)
elif page == 'birth':
#parse_page_birth(soup, month, day):
return parse_page_birth(soup, **kwargs)
elif page == 'ethnic':
#parse_page_ethnic(soup, ethnic):
return parse_page_ethnic(soup, **kwargs)
elif page == 'dist':
return parse_page_dist_stu(soup,'distable')
elif page == 'stu':
return parse_page_dist_stu(soup,'studio')
elif page == 'actor':
#parse_page_performer(soup, url):
return parse_page_performer(soup, **kwargs)
elif page == 'movies':
#parse_page_movie(soup, href, title)
return parse_page_movie(soup, **kwargs)
else:
logging.warning(f"wrong page: {page}")
return None
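A rough usage sketch of the dispatcher (the HTML string and ethnicity value are placeholders): the list-style pages ('astro', 'birth', 'ethnic', 'dist', 'stu') return a (data, next_url) pair, while 'actor' and 'movies' return a single dict.
html = "<html>...</html>"  # hypothetical response body
data, next_url = common_parser(html=html, page='ethnic', ethnic='Asian')
if data:
    for person in data:
        print(person['person'], person['href'])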
'''
# Fetch a page with CloudScraper, validate it, and support different parsers and optional preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
if load_from_local: # read from the local cache instead of the network
html = utils.read_raw_html(url)
if html:
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(html) if preprocessor else html
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
return soup, http_code_local # a code below 100 signals the page came from the local cache
for attempt in range(max_retries):
try:
if host_url not in url.lower():
logging.error(f'wrong url format: {url}')
return None, http_code_url
response = scraper.get(url, headers=headers)
# Handle HTTP status codes
if response.status_code == 404:
logging.debug(f"Page not found (404): {url}")
return None, http_code_404 # return 404 directly so the caller can skip this page
response.raise_for_status() # raise on HTTP errors
# Outdated pages are treated the same as a 404
if "invalid or outdated page" in response.text.lower():
logging.debug(f"invalid or outdated page: {url}")
return None, http_code_404 # return 404 directly so the caller can skip this page
if save_raw_html:
utils.write_raw_html(url, response.text)
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # run the custom page check
return soup, response.status_code
else:
# Check for redirects, e.g. to a login page
if response.history:
logging.warning(f"Page redirected on {url}. Validation failed.")
return None, http_code_login
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None, None # still failing after the maximum number of retries
'''
# Fix the HTML structure: strip stray tags and patch <a> tags; needed when fetching the ethnicity pages
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Check whether the movie table is present
def movie_validator(soup, table_id):
return soup.find("table", id=table_id) is not None
# Parse the HTML and extract the data we need (ethnicity list page)
def parse_page_ethnic_list(soup, href):
div_root = soup.find("select", id="ethnicity1")
if not div_root:
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
return None, None
list_data = []
# 提取所有的 <option> 标签
options = div_root.find_all('option')
if options:
# Parse each option's value and text
for option in options:
href = option.get('value', None)
text = option.text.strip()
if href and href.lower() == 'none':
continue
list_data.append({
"name": text,
"href": host_url + href if href else ''
})
return list_data
# Parse the HTML and extract the data we need (astrology page)
def parse_page_astro(soup, astro):
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
list_data.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the page content and build the birth list
def parse_page_birth(soup, month, day):
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
# Skip hrefs that are already in the list
flag = True
if any(entry['href'] == href for entry in list_data):
continue
# Add the entry to the list
list_data.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need (ethnicity page)
def parse_page_ethnic(soup, ethnic):
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_data = []
next_url = None
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
# Store the entry in the ethnicity list
list_data.append({
'ethnic': ethnic,
'person': person,
'href': href
})
if flag:
logging.debug(f"get {len(list_data)} persons from this page.")
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.debug(f"Found next page: {next_url}")
return list_data, next_url
else:
logging.debug(f"All pages fetched for {ethnic}.")
return list_data, None
else:
return None, None
# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
list_data = []
next_url = None
select_element = soup.find('select', {'name': select_name})
if select_element :
options = select_element.find_all('option')
for option in options:
value = option.get('value') # value attribute
text = option.text.strip() # text content
list_data.append({
'name' : text,
'href' : str(value)
})
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need (distributor/studio table)
def parse_page_dist_stu(soup, table_id):
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found ")
return None, None
# Find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose() # drop the thead; it does not need parsing
# Only the tbody remains
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
list_data = []
next_url = None
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
list_data.append({
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return list_data, next_url
# Parse the credits table (both performer and director credits)
def parse_credits_table(table, distributor_list):
# Find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose() # drop the thead; it does not need parsing
# Only the tbody remains
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list} # initialise a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
#tr_class = row.get('class', '') # class attribute, or empty string if missing
tr_class = ' '.join(row.get('class', [])) # class attribute joined into a string, empty if missing
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
href_a = cols[0].find('a')
href = href_a['href'] if href_a else ''
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
href_d = cols[2].find('a')
href_dist = host_url + href_d['href'] if href_d else ''
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'href' : href,
'year': year,
'distributor': distributor,
'distributor_href': href_dist,
'notes': notes,
'rev': rev,
'formats': formats,
'tr_class': tr_class
})
return movies, distributor_count
# Extract the needed data from a performer page
def parse_page_performer(soup, url):
# Extracted data
data = {}
# Field names we need and the matching labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the credits tables (performer and director roles)
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# Track statistics in a dict
distributor_count = {key: 0 for key in distributor_list} # initialise a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# Update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# Nothing was found
if len(credits_list) == 0 :
logging.warning(f"movie table empty. url: {url} ")
# Walk every bioheading and collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
# 如果包含 "Performer",需要特殊处理
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
# Save the value
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
# Add the statistics to data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
data['credits'] = credits_list
return data
# Parse the movie page HTML and extract the movie info
def parse_page_movie(soup, href, title):
# Parse the basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
if key == "Directors": # 解析多位导演的情况
directors = []
links = value.find_all("a")
for link in links:
director_name = link.text.strip()
director_href = host_url + link['href'] if link['href'] else ''
directors.append({"name": director_name, "href": director_href})
movie_data[key] = directors
else:
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
# Parse the cast info
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
#performer["tags"] = [
# tag.strip() for br in cast.find_all("br")
# if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
#]
tags = []
for br in cast.find_all("br"):
tag = br.next_sibling
if isinstance(tag, str) and tag.strip():
tags.append(tag.strip())
performer["tags"] = tags
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Parse the scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip() # scene number
performer_info = cols[1] # performer and link info
# Get the full HTML before the <br> (keeps <i> tags and other formatting)
performer_html = str(performer_info) # full HTML content
split_html = performer_html.split("<br/>") # split on <br>
if split_html:
performers_html = split_html[0].strip() # take the part before the <br>
else:
split_html = performer_html.split("<br>") # split on <br>
if split_html:
performers_html = split_html[0].strip() # take the part before the <br>
else:
performers_html = performer_html.strip() # no <br>, keep everything
# Convert to plain text (strip HTML tags, keep only the text)
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
# Extract the performers
scene_performers = [p.strip() for p in performers_text.split(",")]
# Try to extract `webscene` and `studio`
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Directors": movie_data.get("Directors", []), # 可能存在的元素
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
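# NOTE: the block below is a legacy test entry point. It calls fetch_page, which is commented out above, so running this module directly will fail unless that helper is restored.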
if __name__ == "__main__":
for astro in astro_list:
url = astr_base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while True:
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = parse_page_astro(soup, astro)
if list_data:
print(list_data[0] if len(list_data)>0 else 'no data')
break
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5) # wait before retrying
time.sleep(2) # throttle the request rate

View File

@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
)
return urlunparse(new_parsed)
def pretty_json_simple(item):
try:
# Serialise to single-line JSON; the input must be a mapping, not a list
return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
except Exception:
# On failure, return the original value unchanged
return item
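A small illustrative sketch of the helper (values are made up): anything dict-like serialises to one compact line, and anything that cannot be turned into a dict is returned unchanged.
pretty_json_simple({'name': 'Example', 'rank': 1})  # -> '{"name":"Example","rank":1}'
pretty_json_simple(42)                              # not a mapping -> returned as-is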