diff --git a/scrapy_proj/cron/cron_scheduler.sh b/scrapy_proj/cron/cron_scheduler.sh
index b027e3b..a74c20d 100755
--- a/scrapy_proj/cron/cron_scheduler.sh
+++ b/scrapy_proj/cron/cron_scheduler.sh
@@ -133,8 +133,9 @@ fi
# 每月任务
if [ "${PERIOD}" = "--monthly" ]; then
- register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
- register_spider "pbox" "scrapy crawl javhd -a mod='update' "
+ register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
+ register_spider "javhd" "scrapy crawl javhd -a mod='update' "
+ register_spider "lord" "scrapy crawl lord -a mod='update' "
fi
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index 7d0cddf..8c052bb 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -6,6 +6,7 @@ from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
+from scrapy_proj.utils.utils import pretty_json_simple
# 注册器字典
spider_handler_registry = {}
@@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return 0
+
+
+@register_handler(comm.SPIDER_NAME_LORD)
+class LordDBHandler(SQLiteDBHandler):
+ def __init__(self, db_path=shared_db_path):
+ super().__init__(db_path)
+ self.tbl_name_actors = 'thelordofporn_actress'
+ self.tbl_name_alias = 'thelordofporn_alias'
+
+ def insert_item(self, item):
+ if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
+ self.insert_actor(item)
+ else:
+ logging.error(f"unkown item.")
+
+ return item
+
+ def insert_actor(self, item):
+ actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
+ if actor_id:
+ for alias in item.get('alias', []):
+ alias_data = {'actress_id':actor_id, 'alias':alias}
+ affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id','alias'], exists_do_nothing=False)
+ if affected_rows:
+ logging.debug(f"insert/update actress_alias. data: {alias_data}")
+ else:
+ logging.warning(f"insert actor alias error!. data: {alias_data}")
+ else:
+ logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")
+
+ # 统计函数
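+    # 返回形如 {'actor_cnt': <演员总数>} 的统计字典;查询不到结果或出错时返回 {}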
+ def get_stat(self):
+ try:
+ self.cursor.execute(f"""
+ SELECT
+ (SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
+ """)
+
+ row = self.cursor.fetchone()
+ if not row:
+ logging.warning(f"query no results.")
+ return {}
+
+ columns = [desc[0] for desc in self.cursor.description]
+ return dict(zip(columns, row))
+
+ except sqlite3.Error as e:
+ logging.error(f"query error: {e}")
+ return {}
+
+ def has_full_data(self, href):
+ try:
+ self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
+ row = self.cursor.fetchone()
+            return row[0] if row else 0
+ except sqlite3.Error as e:
+ logging.error(f"query error: {e}")
+ return 0
diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
index 75ce2ea..b74cf6b 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
@@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # 应用单例元类
logging.error(f"Error inserting or updating data: {e}")
return None
+ def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
+ """
+ 针对联合主键表执行插入或更新操作
+
+        :param data: 字典类型,待插入或更新的数据
+        :param tbl_name: 表名
+        :param composite_pk: 列表类型,联合主键字段名集合
+        :param exists_do_nothing: 布尔值,记录已存在时是否跳过更新,默认True
+        :return: None=出错, 0=记录已存在且未更新, 1=执行更新, 2=执行插入
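+
+        使用示例(与本次提交中 LordDBHandler.insert_actor 的调用方式一致,数据值仅作说明):
+            handler.insert_or_update_with_composite_pk(
+                data={'actress_id': 1, 'alias': 'Some Alias'},
+                tbl_name='thelordofporn_alias',
+                composite_pk=['actress_id', 'alias'],
+                exists_do_nothing=False)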
+ """
+ try:
+ # 校验联合主键参数有效性
+ if not isinstance(composite_pk, list) or len(composite_pk) < 2:
+ logging.error(f"联合主键必须是包含至少两个字段的列表: {composite_pk}")
+ return None
+
+ processed_data = self.check_and_process_data(data, tbl_name)
+
+ # 校验联合主键字段是否都在数据中存在
+ for pk_field in composite_pk:
+ if pk_field not in processed_data:
+ logging.error(f"联合主键字段 '{pk_field}' 未在数据中提供")
+ return None
+
+ # 构建查询条件
+ where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
+ pk_values = [processed_data[pk] for pk in composite_pk]
+
+ # 检查记录是否存在
+ self.cursor.execute(
+ f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
+ pk_values
+ )
+ exists = self.cursor.fetchone() is not None
+
+ if exists:
+ if exists_do_nothing:
+ return 0
+
+ # 构建更新字段(排除联合主键字段)
+ update_fields = [f for f in processed_data.keys() if f not in composite_pk]
+ if not update_fields:
+ return 0
+
+ set_clause = ", ".join([f"{field} = ?" for field in update_fields])
+ update_values = [processed_data[field] for field in update_fields] + pk_values
+
+ # 执行更新(兼容低版本SQLite的标准语法)
+ update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
+ self.cursor.execute(update_sql, update_values)
+ return 1
+ else:
+ # 执行插入操作
+ columns = ", ".join(processed_data.keys())
+ placeholders = ", ".join(["?" for _ in processed_data.keys()])
+ insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
+ self.cursor.execute(insert_sql, list(processed_data.values()))
+ return 2
+ except sqlite3.Error as e:
+ logging.error(f"Error inserting or updating data: {e}")
+ return None
+
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py
index c922eed..bea1266 100644
--- a/scrapy_proj/scrapy_proj/items.py
+++ b/scrapy_proj/scrapy_proj/items.py
@@ -192,4 +192,33 @@ class JavHDActorItem(scrapy.Item):
birth_date = scrapy.Field()
ethnicity = scrapy.Field()
birth_place = scrapy.Field()
- is_full_data = scrapy.Field()
\ No newline at end of file
+ is_full_data = scrapy.Field()
+
+
+class LordActorItem(scrapy.Item):
+ item_type = scrapy.Field()
+ pornstar = scrapy.Field()
+ rating = scrapy.Field()
+ rank = scrapy.Field()
+ votes = scrapy.Field()
+ href = scrapy.Field()
+ career_start = scrapy.Field()
+ measurements = scrapy.Field()
+ born = scrapy.Field()
+ height = scrapy.Field()
+ weight = scrapy.Field()
+ date_modified = scrapy.Field()
+ global_rank = scrapy.Field()
+ weekly_rank = scrapy.Field()
+ last_month_rating = scrapy.Field()
+ current_rating = scrapy.Field()
+ total_votes = scrapy.Field()
+ birth_date = scrapy.Field()
+ birth_year = scrapy.Field()
+ birth_place = scrapy.Field()
+ height_ft = scrapy.Field()
+ height_cm = scrapy.Field()
+ weight_lbs = scrapy.Field()
+ weight_kg = scrapy.Field()
+ is_full_data = scrapy.Field()
+ alias = scrapy.Field()
diff --git a/scrapy_proj/scrapy_proj/spiders/base_spider.py b/scrapy_proj/scrapy_proj/spiders/base_spider.py
index 8fff7b7..a768752 100644
--- a/scrapy_proj/scrapy_proj/spiders/base_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/base_spider.py
@@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
yield request
def parse(self, response):
- """统一的响应处理入口"""
+ """统一的响应处理入口,实际上没有起作用,因为直接走了 scrapy.Request 里的 callback """
# 记录请求耗时
request_time = response.meta.get('request_time')
if request_time:
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index 9ea1bf8..f288a8b 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -1,15 +1,19 @@
import scrapy
import re
+import sys
+from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
+from scrapy_proj.spiders.parser.iafd_parser import common_parser
+from scrapy_proj.utils.utils import pretty_json_simple
db_tools = IAFDDBHandler()
class IAFDSpider(BaseSpider):
name = SPIDER_NAME_IAFD
- allowed_domains = ["iafd.com"]
+ allowed_domains = ["iafd.com", "www.iafd.com"]
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
@@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
studios_list_url = f"{host_url}/studio.asp"
ethnic_list_url = f'{host_url}/advsearch.asp'
- def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
+ def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
- self.update = int(update)
+ self.update_mode = True if mod and mod.lower() == 'update' else False
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
self.cmd_astro = 'astro'
@@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
query_args = {}
if self.debug:
query_args['limit'] = 5
- if self.update == 0:
+ if self.update_mode:
query_args['is_full_data'] = 0
+ query_args['is_full_data'] = 404
# 读取待更新的演员列表
if self.cmd_performers in self.cmd_list:
@@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
self.logger.info(f"fetch from db. item: {item}")
- yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+ yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type':'actor'})
# 读取待更新的影片列表
if self.cmd_movies in self.cmd_list:
@@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
for item in movies:
href = item.get('href', '')
self.logger.info(f"fetch from db. item: {item}")
- yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+ yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type':'movie'})
def start_astro(self):
@@ -113,50 +118,28 @@ class IAFDSpider(BaseSpider):
yield request
def parse_astro_page(self, response):
- astro = response.meta['astro']
- astro_div = response.css('div#astro')
- if astro_div:
- birth_date = None
- for elem in astro_div.css('*'):
- if elem.css('h3.astroday'):
- birth_date = elem.css('h3.astroday::text').get().strip()
- elif elem.css('div.perficon'):
- a_tag = elem.css('a')
- if a_tag:
- href = self.host_url + a_tag.attrib['href']
- name = a_tag.css('span.perfname::text').get()
- if name:
- item = IAFDPersonItem()
- item['name'] = name
- item['href'] = href
- item['from_astro_list'] = 1
- item['from_birth_list'] = 0
- item['from_ethnic_list'] = 0
- item['from_movie_list'] = 0
- yield item
- #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+ astro = response.meta.get('astro', '')
+ data, next_url = common_parser(html=response.text, page='astro', astro=astro)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
+ else:
+ self.logger.warning(f"parse data error. {response.url}")
+ item = IAFDPersonDetailItem()
+ #yield item
+
def parse_birth_page(self, response):
month = response.meta['month']
day = response.meta['day']
- datarows = response.css('div.col-sm-12.col-lg-9')
- if datarows:
- rows = datarows[0].css('div.col-sm-4')
- for row in rows:
- link_tag = row.css('a')
- person = link_tag.css('::text').get().strip() if link_tag else ''
- href = self.host_url + link_tag.attrib['href'] if link_tag else ''
-
- item = IAFDPersonItem()
- item['name'] = person
- item['href'] = href
- item['from_astro_list'] = 0
- item['from_birth_list'] = 1
- item['from_ethnic_list'] = 0
- item['from_movie_list'] = 0
- yield item
- #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+ data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
+ else:
+ self.logger.warning(f"parse data error. {response.url}")
+ item = IAFDPersonDetailItem()
+ #yield item
+
def parse_ethnic_list_page(self, response):
div_root = response.css('select#ethnicity1')
if div_root:
@@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
href = option.attrib.get('value')
text = option.css('::text').get().strip()
if href and href.lower() != 'none':
- ethnic_url = self.host_url + href
+ ethnic_url = urljoin(response.url , href)
+ self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
- if self.debug:
- break
def parse_ethnic_page(self, response):
ethnic = response.meta['ethnic']
- rows = response.css('div.row.headshotrow')
- for row in rows:
- cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
- for col in cols:
- link_tag = col.css('a')
- img_tag = col.css('div.pictag')
- if link_tag and img_tag:
- href = self.host_url + link_tag.attrib['href']
- person = img_tag.css('::text').get().strip()
-
- item = IAFDPersonItem()
- item['name'] = person
- item['href'] = href
- item['from_astro_list'] = 0
- item['from_birth_list'] = 0
- item['from_ethnic_list'] = 1
- item['from_movie_list'] = 0
- yield item
- #yield scrapy.Request(href, callback=self.parse_person_detail_page)
-
- next_page = response.css('a[rel="next"]')
- if next_page:
- next_url = self.host_url + next_page.attrib['href']
- yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+ data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
- self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
- self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
+ self.logger.warning(f"parse data error. {response.url}")
+
+ if next_url:
+ self.logger.info(f"find next page: {next_url}")
+ else:
+ self.logger.info(f"found all pages. url: {response.url}")
+
+ item = IAFDPersonDetailItem()
+ #yield item
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
@@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
- dis_url = self.host_url + f"/distrib.rme/distrib={value}"
- item = IAFDMovieItem()
- item['title'] = text
- item['href'] = dis_url
- item['release_year'] = 0
- item['from_performer_list'] = 0
- item['from_dist_list'] = 1
- item['from_stu_list'] = 0
- yield item
- #yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
+ dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
+ yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
@@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
- stu_url = self.host_url + f"/studio.rme/studio={value}"
- item = IAFDMovieItem()
- item['title'] = text
- item['href'] = stu_url
- item['release_year'] = 0
- item['from_performer_list'] = 0
- item['from_dist_list'] = 0
- item['from_stu_list'] = 1
- yield item
- #yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
+ dis_url = f"{self.host_url}/studio.rme/studio={value}"
+ yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
+
+ def parse_stu_dist_page(self, response):
+ list_type = response.meta.get('list_type', '')
+ data, next_url = common_parser(html=response.text, page=list_type)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
+ else:
+ self.logger.warning(f"fetched data error. {response.url}")
+
+ item = IAFDPersonDetailItem()
+ #yield item
+
def parse_person_detail_page(self, response):
+ data = common_parser(html=response.text, page='actor', url=response.url)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
+ else:
+ self.logger.warning(f"fetched data error. {response.url}")
+
item = IAFDPersonDetailItem()
- item['href'] = response.url
- item['person'] = response.css('h1::text').get() # 假设姓名在 h1 标签中
- # 解析其他详细信息,根据实际页面结构修改
- item['gender'] = response.css('span.gender::text').get()
- item['birthday'] = response.css('span.birthday::text').get()
- item['astrology'] = response.css('span.astrology::text').get()
- item['birthplace'] = response.css('span.birthplace::text').get()
- item['years_active'] = response.css('span.years_active::text').get()
- item['ethnicity'] = response.css('span.ethnicity::text').get()
- item['nationality'] = response.css('span.nationality::text').get()
- item['hair_colors'] = response.css('span.hair_colors::text').get()
- item['eye_color'] = response.css('span.eye_color::text').get()
- item['height'] = response.css('span.height::text').get()
- item['weight'] = response.css('span.weight::text').get()
- item['measurements'] = response.css('span.measurements::text').get()
- item['tattoos'] = response.css('span.tattoos::text').get()
- item['piercings'] = response.css('span.piercings::text').get()
- item['movies_cnt'] = response.css('span.movies_cnt::text').get()
- item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
- item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
- item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
- item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
- item['performer_aka'] = response.css('span.performer_aka::text').getall()
- yield item
+ #yield item
def parse_movie_detail_page(self, response):
+ title = response.meta.get('title', '')
+ data = common_parser(html=response.text, page='movies', href=response.url, title=title)
+ if data:
+ self.logger.debug(f"fetched data from {response.url}, data: {data}")
+ else:
+ self.logger.warning(f"fetched data error. {response.url}")
+
item = IAFDMovieDetailItem()
- item['title'] = response.css('h1::text').get() # 假设标题在 h1 标签中
- item['href'] = response.url
- # 解析其他详细信息,根据实际页面结构修改
- yield item
\ No newline at end of file
+ #yield item
+
+ def custom_block_check(self, response):
+ item_type = response.meta.get('item_type', '')
+ if "invalid or outdated page" in response.text.lower():
+ self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
+ return "invalid or outdated page"
+ else:
+ self.logger.info(f"right content. url: {response.url}")
+
+ return None
+
+ # 处理页面异常,主要是404, 403
+ def handle_blocked(self, response, reason):
+ item_type = response.meta.get('item_type', '')
+ if response.status in [404, 403]:
+ self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/spiders/javhd_spider.py b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py
index 5c6d1f7..cb7360a 100644
--- a/scrapy_proj/scrapy_proj/spiders/javhd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py
@@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
item['rank'] = rank
item['url'] = url
item[f'{lang}_name'] = name
- #TODO: 非英语的页面,要去更新对应的名字
+ # 非英语的页面,要去更新对应的名字
if lang != 'en':
item['url'] = replace_lang_param(item['url'])
yield item
@@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
meta={"list_item": item} # 传递列表页数据到详情页
)
else:
- self.logger.info(f"actor(name) has full data. skip. url: {url}")
+ self.logger.info(f"actor({name}) has full data. skip. url: {url}")
# 获取下一页
next_path = data.get("pagination_params", {}).get("next")
diff --git a/scrapy_proj/scrapy_proj/spiders/lord_spider.py b/scrapy_proj/scrapy_proj/spiders/lord_spider.py
new file mode 100644
index 0000000..44d690e
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/spiders/lord_spider.py
@@ -0,0 +1,399 @@
+import scrapy
+import sys
+import re
+from urllib.parse import urljoin, quote_plus
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
+from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
+from scrapy_proj.items import LordActorItem
+from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
+from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler
+
+db_tools = LordDBHandler()
+
+class LordSpider(BaseSpider):
+ name = SPIDER_NAME_LORD
+ allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]
+
+ # 配置请求头(复用curl中的头部信息)
+ custom_settings = {
+ "DEFAULT_REQUEST_HEADERS": {
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+ "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+ "if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
+ "priority": "u=0, i",
+ "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-platform": "\"macOS\"",
+ "sec-fetch-dest": "document",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "none",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
+ },
+ "COOKIES_ENABLED": True # 启用Cookie支持
+ }
+
+ def __init__(self, debug='false', mod='update', *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+ self.update_mod = False if mod and mod.lower() == 'force' else True
+
+ self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
+
+ # 入口函数,由基类的方法触发
+ def custom_start_requests(self):
+ url = 'https://thelordofporn.com/pornstars/'
+ yield scrapy.Request(
+ url=url,
+ headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
+ callback=self.parse_list,
+ meta={} # 传递列表页数据到详情页
+ )
+
+ def parse_list(self, response):
+ # 提取所有演员条目(对应原代码中的article.loop-item)
+ articles = response.css("article.loop-item")
+ self.logger.info(f"当前页({response.url})找到 {len(articles)} 个演员条目")
+
+ for article in articles:
+ try:
+ # 提取演员名称和详情页链接
+ title_tag = article.css("h3.loop-item__title a")
+ title = title_tag.css("::text").get(default="N/A").strip()
+ href = title_tag.attrib.get("href") # 获取a标签的href属性
+
+ # 提取评分
+ rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()
+
+ # 提取排名和投票数(对应原代码中的meta_tags)
+ meta_tags = article.css("div.loop-item__rank span")
+ rank = None
+ votes = None
+
+ # 解析排名(第一个span中的b标签)
+ if len(meta_tags) >= 1:
+ rank_b = meta_tags[0].css("b::text").get()
+ rank = rank_b.strip() if rank_b else "N/A"
+
+ # 解析投票数(第二个span中的b标签)
+ if len(meta_tags) >= 2:
+ votes_b = meta_tags[1].css("b::text").get()
+ votes = votes_b.strip() if votes_b else "N/A"
+
+ # 转换为数值类型(模拟原代码中的utils.parse_numeric)
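+                # 示例: "1,234" -> 1234, "#12" -> 12, "N/A" 或空串 -> None;注意小数点也会被移除,如 "8.5" -> 85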
+ def parse_numeric(value):
+ if not value or value == "N/A":
+ return None
+ # 移除非数字字符(如逗号、%等)
+ numeric_str = ''.join(filter(str.isdigit, value))
+ return int(numeric_str) if numeric_str else None
+
+ # 构建演员数据字典
+ actress_data = {
+ "pornstar": title,
+ "rating": parse_numeric(rating),
+ "rank": parse_numeric(rank),
+ "votes": parse_numeric(votes),
+ "href": href if href else None
+ }
+ # 发起详情查询
+ actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
+ if actor_exists < 1 :
+ yield scrapy.Request(
+ url=href,
+ callback=self.parse_actor_detail,
+ headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
+ meta = {'actor':actress_data}
+ )
+ else:
+ self.logger.info(f"actor({title}) has full data. skip. url: {href}")
+
+ except Exception as e:
+ self.logger.error(f"解析演员条目失败: {e}, 页面: {response.url}")
+ continue # 跳过错误条目,继续解析下一个
+
+ # 提取下一页链接(对应原代码中的.next.page-numbers)
+ next_page_url = None
+ next_page_tag = response.css(".nav-links .next.page-numbers")
+ if next_page_tag:
+ next_page_href = next_page_tag.attrib.get("href")
+ if next_page_href and not self.debug:
+ # 拼接完整URL(处理相对路径)
+ next_page_url = urljoin(response.url, next_page_href)
+ yield scrapy.Request(
+ url=next_page_url,
+ callback=self.parse_list,
+ headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
+ meta = {}
+ )
+ else:
+ self.logger.info(f"已解析所有页面, current url: {response.url}")
+
+ def parse_actor_detail(self, response):
+ # 1. 定义字段映射表:页面原始字段 -> Item字段
+ FIELD_MAPPING = {
+ # 基本信息
+ 'date_modified': 'date_modified',
+ # 排名信息
+ 'Global Rank': 'global_rank',
+ 'Weekly Rank': 'weekly_rank',
+ # 评分信息
+ 'Last Month': 'last_month_rating',
+ 'Rating Av.': 'current_rating',
+ 'Total of Votes': 'total_votes',
+ # 详细属性
+ 'Career start': 'career_start',
+ 'Measurements': 'measurements',
+ 'Born': 'born',
+ 'Height': 'height',
+ 'Weight': 'weight',
+ 'Name': 'alias_raw', # 别名对应Name字段
+ # 解析后字段(出生/身高/体重)
+ 'birth_date': 'birth_date',
+ 'birth_year': 'birth_year',
+ 'birth_place': 'birth_place',
+ 'height_ft': 'height_ft',
+ 'height_cm': 'height_cm',
+ 'weight_lbs': 'weight_lbs',
+ 'weight_kg': 'weight_kg',
+ 'alias':'alias'
+ }
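+        # 说明:左侧为页面上的原始标签/键,右侧为 LordActorItem 字段;
+        # 不在 item.fields 中的目标字段(如 'alias_raw')会在下方的映射循环中被跳过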
+
+ # 2. 初始化原始数据容器
+ raw_data = {}
+ # 3. 提取基础信息
+ raw_data['href'] = response.url
+ entry_header = response.css("header.entry-header")
+ raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
+ raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()
+
+ # 4. 提取排名信息
+ for item in entry_header.css("div.porn-star-rank__item"):
+ item_text = item.css("::text").get(default="").strip()
+ raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
+
+ # 5. 提取评分和投票信息
+ for item in response.css("div.specifications__item--horizontal"):
+ # 1. 精准定位标题区域(排除b标签)
+ # 情况1:有子div的结构(如Rating Av.带img)
+ title_div = item.css("div:first-child")
+ if title_div:
+ # 只提取子div内的文本(自动排除同级的b标签)
+ title_parts = title_div.css("::text").getall()
+ else:
+ # 情况2和3:无子div的结构(Last Month和Total of Votes)
+ # 提取当前item内所有文本,但排除b标签的内容
+ all_text_parts = item.css("::text").getall()
+ b_text_parts = item.css("b::text").getall()
+ # 从所有文本中移除b标签的文本
+ title_parts = [t for t in all_text_parts if t not in b_text_parts]
+
+ # 2. 清理标题文本(处理非断空格和空白)
+ title_text = "".join(title_parts)
+ title_text = title_text.replace(u'\xa0', u' ') # 替换非断空格
+ title_text = re.sub(r'\s+', ' ', title_text).strip() # 合并空白
+
+ raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
+
+ # 6. 提取详细属性(specifications-grid-row)
+ for row in response.css("div.specifications-grid-row"):
+ items = row.css("div.specifications-grid-item")
+ for i in [0, 1]: # 处理每行2个属性
+ if i < len(items):
+ label = extract_text_from_element(items[i].css("h5"))
+ value = extract_text_from_element(items[i].css("span"))
+ if label:
+ raw_data[label] = value
+
+ # 7. 处理特殊字段(别名需要清洗)
+ raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))
+
+ # 9. 解析出生信息、身高、体重并合并
+ raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
+ raw_data.update(self.parse_height(raw_data.get("Height", "")))
+ raw_data.update(self.parse_weight(raw_data.get("Weight", "")))
+
+ # 10. 映射到Item并返回
+ item = LordActorItem()
+ item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
+ actor_data = response.meta['actor']
+ for k, v in actor_data.items():
+ if k in item.fields:
+ item[k] = v
+
+ for raw_field, item_field in FIELD_MAPPING.items():
+ if item_field in item.fields:
+ item[item_field] = raw_data.get(raw_field, "")
+
+ # 标记为完整数据
+ item['is_full_data'] = 1
+ self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")
+
+ yield item
+
+ # 保留原工具函数(需作为Spider类的方法)
+ def parse_birth_info(self, text):
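+        # 示例(假设输入): "March 8, 1990 in Prague, Czech Republic"
+        #   -> birth_date: "March 8, 1990", birth_year: "1990", birth_place: "Prague, Czech Republic"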
+ match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
+ if match:
+ return {
+ "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
+ "birth_year": match.group(3),
+ "birth_place": match.group(4),
+ }
+ return {"birth_date": text, "birth_year": "", "birth_place": ""}
+
+
+ def parse_height2(self, text):
+ match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
+ if match:
+ height_ft = f"{match.group(1)}'{match.group(2)}\""
+ return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
+ return {"height_ft": text, "height_cm": ""}
+ def parse_height(self, text):
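+        # 示例(假设输入): "5 ft 2 in (157 cm)" -> height_ft: 5'2", height_cm: 157
+        #               "170 cm / 5 feet and 7 inches" -> height_ft: 5'7", height_cm: 170
+        #               纯厘米如 "160cm" -> height_ft: "", height_cm: 160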
+ # 统一预处理:替换逗号为小数点,处理常见笔误(如'n'→'in')
+ text = text.replace(',', '.').replace(' n ', ' in ').strip()
+
+ # 正则表达式:匹配所有英尺+英寸格式(支持多种表达方式)
+ # 分组说明:
+ # 1. 英尺数值 2. 英尺单位(feet/ft/ft./') 3. 英寸数值 4. 英寸单位(inches/in/in./inch/")
+ # 5. 厘米/米数值 6. 单位(cm/m)
+ pattern = r"""
+ # 情况1:先英尺英寸,后厘米/米(主流格式)
+ (?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分(如5ft/5')
+ (?:and\s*)? # 可选的"and"(如5 feet and 2 inches)
+ (\d+)\s*(inches|in\.?|inch|")?\s* # 英寸部分(如2in/2")
+ (?:\(?(\d+\.?\d*)\s*(cm|m)\)?) # 厘米/米部分(如(157cm)/(1.57m))
+
+ | # 或
+
+ # 情况2:先厘米,后英尺英寸(如170 cm / 5 feet and 7 inches)
+ (\d+)\s*cm\s*/\s* # 厘米在前
+ (?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分
+ (?:and\s*)?
+ (\d+)\s*(inches|in\.?|inch|")? # 英寸部分
+
+ | # 或
+
+ # 情况3:纯简写格式(如5'3" (160 cm))
+ (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)? # 5'3"格式
+ """
+
+ # 使用VERBOSE忽略正则中的空格,IGNORECASE忽略大小写
+ match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
+ if not match:
+ # 处理纯厘米格式(如"160cm")
+ cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
+ if cm_match:
+ return {"height_ft": "", "height_cm": cm_match.group(1)}
+ return {"height_ft": text, "height_cm": ""}
+
+ # 提取匹配结果(根据不同情况处理分组)
+        ft = None
+        inch = None
+        cm = None
+        num = None
+        unit = None
+
+ # 情况1:先英尺英寸后厘米/米
+ if match.group(1) and match.group(3):
+ ft = match.group(1)
+ inch = match.group(3)
+ num = match.group(5)
+ unit = match.group(6).lower() if match.group(6) else 'cm'
+
+ # 情况2:先厘米后英尺英寸
+ elif match.group(7):
+ cm = match.group(7)
+ ft = match.group(8)
+ inch = match.group(10)
+ unit = 'cm' # 情况2中前面的单位固定为cm
+
+ # 情况3:纯简写格式(5'3")
+        elif match.group(12) and match.group(13):
+            ft = match.group(12)
+            inch = match.group(13)
+            cm = match.group(14)
+ unit = 'cm'
+
+ # 处理厘米/米转换(米转厘米)
+ if not cm and num and unit:
+ if unit == 'm':
+ cm = str(int(float(num) * 100)) # 1.57m → 157cm
+ else:
+ cm = num # 直接使用cm数值
+
+ # 格式化英尺英寸表达式(如5'2")
+ height_ft = f"{ft}'{inch}\"" if ft and inch else ""
+
+ return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}
+
+
+ def parse_weight2(self, text):
+ match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
+ if match:
+ return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
+ return {"weight_lbs": text, "weight_kg": ""}
+
+ def parse_weight(self, text):
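+        # 示例(假设输入): "110 lbs (50 kg)" -> weight_lbs: 110, weight_kg: 50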
+ # 预处理:清理空格和常见格式问题
+ text = text.strip().replace(' ', ' ')
+
+ # 正则表达式:匹配多种体重格式
+ # 分组说明:
+ # 1. 磅数值 2. 磅单位(lb/lbs/pounds) 3. 千克数值 4. 千克单位(kg)
+ # 5. 千克在前的数值 6. 千克单位 7. 磅在后的数值 8. 磅单位
+ pattern = r"""
+ # 情况1:磅在前,千克在后(主流格式)
+ (?:(\d+)\s*(lb|lbs|pounds)?\s*) # 磅部分(支持lb/lbs/pounds或省略单位)
+ (?:\(?\s*(\d+)\s*(kg)\s*\)?) # 千克部分(如(45 kg))
+
+ | # 或
+
+ # 情况2:千克在前,磅在后(如52 kg / 114 lbs)
+ (?:(\d+)\s*(kg)\s*/\s*) # 千克部分
+ (\d+)\s*(lb|lbs|pounds)? # 磅部分
+ """
+
+ # 使用VERBOSE和IGNORECASE标志增强兼容性
+ match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
+ if not match:
+ # 尝试匹配纯千克格式(如"52kg")
+ kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
+ if kg_match:
+ return {"weight_lbs": "", "weight_kg": kg_match.group(1)}
+
+ # 尝试匹配纯磅格式(如"114lb")
+ lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
+ if lb_match:
+ return {"weight_lbs": lb_match.group(1), "weight_kg": ""}
+
+ # 完全无法解析的情况
+ return {"weight_lbs": text, "weight_kg": ""}
+
+ # 提取匹配结果(根据不同情况处理分组)
+ weight_lbs = None
+ weight_kg = None
+
+ # 情况1:磅在前,千克在后
+ if match.group(1) and match.group(3):
+ weight_lbs = match.group(1)
+ weight_kg = match.group(3)
+
+ # 情况2:千克在前,磅在后
+ elif match.group(5) and match.group(6):
+ weight_kg = match.group(5)
+ weight_lbs = match.group(7)
+
+ return {
+ "weight_lbs": weight_lbs.strip() if weight_lbs else "",
+ "weight_kg": weight_kg.strip() if weight_kg else ""
+ }
+
+ def clean_alias(self, alias):
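+        # 示例(假设输入): "Foo Bar (Age 30), Baz" -> ['Foo Bar', 'Baz']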
+        alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
+ return [name.strip() for name in alias.split(',') if name.strip()]
+
+ def parse_numeric(self, value):
+ try:
+ return float(value)
+ except (ValueError, TypeError):
+ return 0
diff --git a/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
new file mode 100644
index 0000000..3464612
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
@@ -0,0 +1,636 @@
+
+import cloudscraper
+import time
+import json
+import csv
+import logging
+import signal
+import sys
+import os
+import re
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+from functools import partial
+#import config
+#import utils
+
+# 定义基础 URL 和可变参数
+host_url = "https://www.iafd.com"
+
+astr_base_url = f"{host_url}/astrology.rme/sign="
+astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
+
+birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
+
+distributors_list_url = f'{host_url}/distrib.asp'
+distributors_base_url = f"{host_url}/distrib.rme/distrib="
+
+studios_list_url = f"{host_url}/studio.asp"
+studios_base_url = f"{host_url}/studio.rme/studio="
+
+ethnic_list_url = f'{host_url}/advsearch.asp'
+
+# 设置 headers 和 scraper
+headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+scraper = cloudscraper.create_scraper()
+
+http_code_404 = 404
+http_code_login = 401
+http_code_url = 601
+http_code_local = 99
+
+save_raw_html = True
+load_from_local = False
+
+def common_parser(html, page, **kwargs):
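+    # 各 page 取值对应的返回形式(与本次提交中 IAFDSpider 的调用方式一致):
+    #   'astro' / 'birth' / 'ethnic' / 'dist' / 'stu' -> 返回 (list_data, next_url) 二元组
+    #   'actor' / 'movies' -> 直接返回解析结果 dict(而不是二元组)
+    #   调用示例: data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)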
+ parser = "lxml" if page=='ethnic' else "html.parser"
+ soup = BeautifulSoup(html, parser)
+ if not soup:
+ return None
+ if page == 'astro':
+ #parse_page_astro(soup, astro):
+ return parse_page_astro(soup, **kwargs)
+ elif page == 'birth':
+ #parse_page_birth(soup, month, day):
+ return parse_page_birth(soup, **kwargs)
+ elif page == 'ethnic':
+ #parse_page_ethnic(soup, ethnic):
+ return parse_page_ethnic(soup, **kwargs)
+ elif page == 'dist':
+ return parse_page_dist_stu(soup,'distable')
+ elif page == 'stu':
+ return parse_page_dist_stu(soup,'studio')
+ elif page == 'actor':
+ #parse_page_performer(soup, url):
+ return parse_page_performer(soup, **kwargs)
+ elif page == 'movies':
+ #parse_page_movie(soup, href, title)
+ return parse_page_movie(soup, **kwargs)
+ else:
+ logging.warning(f"wrong page: {page}")
+ return None
+
+'''
+#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
+def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+ if load_from_local: # 从本地读取的逻辑
+ html = utils.read_raw_html(url)
+ if html:
+ # 预处理 HTML(如果提供了 preprocessor)
+ html_text = preprocessor(html) if preprocessor else html
+
+ soup = BeautifulSoup(html_text, parser)
+ if validator(soup): # 进行自定义页面检查
+ return soup, http_code_local # 返回一个小于100的错误码,表明是从本地返回的
+
+ for attempt in range(max_retries):
+ try:
+ if host_url not in url.lower():
+ logging.error(f'wrong url format: {url}')
+ return None, http_code_url
+
+ response = scraper.get(url, headers=headers)
+
+ # 处理 HTTP 状态码
+ if response.status_code == 404:
+ logging.debug(f"Page not found (404): {url}")
+ return None, http_code_404 # 直接返回 404,调用方可以跳过
+
+ response.raise_for_status() # 处理 HTTP 错误
+
+ # 过期的网页,与404相同处理
+ if "invalid or outdated page" in response.text.lower():
+ logging.debug(f"invalid or outdated page: {url}")
+ return None, http_code_404 # 直接返回 404,调用方可以跳过
+
+ if save_raw_html:
+ utils.write_raw_html(url, response.text)
+
+ # 预处理 HTML(如果提供了 preprocessor)
+ html_text = preprocessor(response.text) if preprocessor else response.text
+
+ soup = BeautifulSoup(html_text, parser)
+ if validator(soup): # 进行自定义页面检查
+ return soup, response.status_code
+ else:
+ # 检查是否发生跳转,比如到登录页面
+ if response.history:
+ logging.warning(f"Page redirected on {url}. Validation failed.")
+ return None, http_code_login
+
+ logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
+ except cloudscraper.exceptions.CloudflareChallengeError as e:
+ logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
+ except cloudscraper.exceptions.CloudflareCode1020 as e:
+ logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
+ except Exception as e:
+ logging.error(f"Unexpected error on {url}: {e}, Retring...")
+
+ logging.error(f'Fetching failed after max retries. {url}')
+ return None, None # 达到最大重试次数仍然失败
+'''
+
+# 修复 HTML 结构,去除多余的 <br> 标签,在获取人种的时候需要
+def preprocess_html(html):
+    return html.replace('<br>', '').replace('</br>', '')
+
+
+# 解析人种列表页,提取 select#ethnicity1 下的 option 列表
+def parse_page_ethnic_list(soup):
+    list_data = []
+    div_root = soup.find('select', id='ethnicity1')
+    if not div_root:
+        return list_data
+    # 找到所有 <option> 标签
+ options = div_root.find_all('option')
+ if options:
+ # 解析并输出 value 和文本内容
+ for option in options:
+ href = option.get('value', None)
+ text = option.text.strip()
+ if href and href.lower() == 'none':
+ continue
+ list_data.append({
+ "name": text,
+ "href": host_url + href if href else ''
+ })
+ return list_data
+
+
+# 解析 HTML 内容,提取需要的数据
+def parse_page_astro(soup, astro):
+ astro_div = soup.find("div", id="astro")
+ if not astro_div:
+ logging.warning(f"Warning: No 'astro' div found in {astro}")
+ return None, None
+
+ flag = False
+ list_cnt = 0
+ list_data = []
+ next_url = None
+
+ birth_date = None
+ for elem in astro_div.find_all(recursive=False):
+ if elem.name == "h3" and "astroday" in elem.get("class", []):
+ birth_date = elem.get_text(strip=True)
+ elif elem.name == "div" and "perficon" in elem.get("class", []):
+ a_tag = elem.find("a")
+ if a_tag:
+ href = host_url + a_tag["href"]
+ name = a_tag.find("span", class_="perfname")
+ if name:
+ list_data.append({
+ "astrology": astro,
+ "birth_date": birth_date,
+ "person": name.get_text(strip=True),
+ "href": href
+ })
+ flag = True
+ list_cnt = list_cnt +1
+ if flag:
+ logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
+ return list_data, next_url
+ else:
+ return None, None
+
+
+# 解析页面内容并更新birth_map
+def parse_page_birth(soup, month, day):
+ datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
+ if not datarows:
+ return None, None
+
+ flag = False
+ list_cnt = 0
+ list_data = []
+ next_url = None
+ rows = datarows[0].find_all('div', class_='col-sm-4')
+ for row in rows:
+ link_tag = row.find('a')
+ person = link_tag.text.strip() if link_tag else ''
+ href = link_tag['href'] if link_tag else ''
+ href = host_url + href
+
+ # 如果 href 已经在 birth_map 中,跳过
+ flag = True
+ if any(entry['href'] == href for entry in list_data):
+ continue
+
+ # 将数据添加到 birth_map
+ list_data.append({
+ 'month': month,
+ 'day': day,
+ 'person': person,
+ 'href': href
+ })
+ list_cnt = list_cnt +1
+
+ if flag:
+ logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
+ return list_data, next_url
+ else:
+ return None, None
+
+
+# 解析 HTML 内容,提取需要的数据
+def parse_page_ethnic(soup, ethnic):
+ rows = soup.find_all('div', class_='row headshotrow')
+ flag = False
+ list_data = []
+ next_url = None
+
+ for row in rows:
+ for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
+ link_tag = col.find('a')
+ img_tag = col.find('div', class_='pictag')
+ flag = True
+
+ if link_tag and img_tag:
+ href = host_url + link_tag['href']
+ person = img_tag.text.strip()
+
+ # 将数据存储到 ethnic_map
+ list_data.append({
+ 'ethnic': ethnic,
+ 'person': person,
+ 'href': href
+ })
+ if flag:
+ logging.debug(f"get {len(list_data)} persons from this page.")
+
+ next_page = soup.find('a', rel='next')
+ if next_page:
+ next_url = host_url + next_page['href']
+ logging.debug(f"Found next page: {next_url}")
+ return list_data, next_url
+ else:
+ logging.debug(f"All pages fetched for {ethnic}.")
+ return list_data, None
+ else:
+ return None, None
+
+# 解析列表页
+def parse_page_dist_stu_list(soup, select_name):
+ list_data = []
+ next_url = None
+
+ select_element = soup.find('select', {'name': select_name})
+ if select_element :
+ options = select_element.find_all('option')
+ for option in options:
+ value = option.get('value') # 获取 value 属性
+ text = option.text.strip() # 获取文本内容
+ list_data.append({
+ 'name' : text,
+ 'href' : str(value)
+ })
+ return list_data, next_url
+ else:
+ return None, None
+
+# 解析 HTML 内容,提取需要的数据
+def parse_page_dist_stu(soup, table_id):
+ table = soup.find("table", id=table_id)
+ if not table:
+ logging.warning(f"Warning: No {table_id} table found ")
+ return None, None
+
+ # 找到thead并跳过
+ thead = table.find('thead')
+ if thead:
+ thead.decompose() # 去掉thead部分,不需要解析
+
+ # 现在只剩下tbody部分
+ tbody = table.find('tbody')
+ rows = tbody.find_all('tr') if tbody else []
+
+ list_data = []
+ next_url = None
+ for row in rows:
+ cols = row.find_all('td')
+ if len(cols) >= 5:
+ title = cols[0].text.strip()
+ label = cols[1].text.strip()
+ year = cols[2].text.strip()
+ rev = cols[3].text.strip()
+ a_href = cols[0].find('a')
+ href = host_url + a_href['href'] if a_href else ''
+
+ list_data.append({
+ 'title': title,
+ 'label': label,
+ 'year': year,
+ 'rev': rev,
+ 'href': href
+ })
+ return list_data, next_url
+
+
+# 解析 作品列表,有个人出演,也有导演的
+def parse_credits_table(table, distributor_list):
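+    # 返回 (movies, distributor_count):movies 为作品字典列表,
+    # distributor_count 按传入的 distributor_list 统计各厂牌出现次数,形如 {'vixen': 2, 'blacked': 0}(示例值)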
+ # 找到thead并跳过
+ thead = table.find('thead')
+ if thead:
+ thead.decompose() # 去掉thead部分,不需要解析
+
+ # 现在只剩下tbody部分
+ tbody = table.find('tbody')
+ rows = tbody.find_all('tr') if tbody else []
+
+ movies = []
+ distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
+
+ # rows = table.find_all('tr', class_='we')
+ for row in rows:
+ #tr_class = row.get('class', '') # 获取 class 属性,如果没有则返回空字符串
+ tr_class = ' '.join(row.get('class', [])) # 获取 class 属性,如果没有则返回空字符串
+ cols = row.find_all('td')
+ if len(cols) >= 6:
+ title = cols[0].text.strip()
+ href_a = cols[0].find('a')
+ href = href_a['href'] if href_a else ''
+ year = cols[1].text.strip()
+ distributor = cols[2].text.strip().lower()
+ href_d = cols[2].find('a')
+ href_dist = host_url + href_d['href'] if href_d else ''
+ notes = cols[3].text.strip()
+ rev = cols[4].text.strip()
+ formats = cols[5].text.strip()
+
+ for key in distributor_list:
+ if key in distributor:
+ distributor_count[key] += 1
+
+ movies.append({
+ 'title': title,
+ 'href' : href,
+ 'year': year,
+ 'distributor': distributor,
+ 'distributor_href': href_dist,
+ 'notes': notes,
+ 'rev': rev,
+ 'formats': formats,
+ 'tr_class': tr_class
+ })
+ return movies, distributor_count
+
+
+# 请求网页并提取所需数据
+def parse_page_performer(soup, url):
+ # 提取数据
+ data = {}
+
+ # 定义我们需要的字段名称和HTML中对应的标签
+ fields = {
+ 'performer_aka': 'Performer AKA',
+ 'birthday': 'Birthday',
+ 'astrology': 'Astrology',
+ 'birthplace': 'Birthplace',
+ 'gender': 'Gender',
+ 'years_active': 'Years Active',
+ 'ethnicity': 'Ethnicity',
+ 'nationality': 'Nationality',
+ 'hair_colors': 'Hair Colors',
+ 'eye_color': 'Eye Color',
+ 'height': 'Height',
+ 'weight': 'Weight',
+ 'measurements': 'Measurements',
+ 'tattoos': 'Tattoos',
+ 'piercings': 'Piercings'
+ }
+ reversed_map = {v: k for k, v in fields.items()}
+
+ # 解析表格数据, 获取参演或者导演的列表
+ role_list = ['personal', 'directoral']
+ distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
+ credits_list = {}
+
+ # 使用字典来存储统计
+ distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
+ for role in role_list:
+ table = soup.find('table', id=role)
+ if table :
+ movies, stat_map = parse_credits_table(table, distributor_list)
+ credits_list[role] = movies
+ # 更新 distributor 统计
+ for distributor in distributor_list:
+ distributor_count[distributor] += stat_map.get(distributor, 0)
+
+ # 统计 movies 数量
+ #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
+ movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
+
+ # 如果没有找到
+ if len(credits_list) == 0 :
+ logging.warning(f"movie table empty. url: {url} ")
+
+ # 遍历每个 bioheading, 获取metadata
+ bioheadings = soup.find_all('p', class_='bioheading')
+ for bio in bioheadings:
+ heading = bio.text.strip()
+ biodata = None
+
+ # 如果包含 "Performer",需要特殊处理
+ if 'Performer' in heading:
+ heading = 'Performer AKA'
+ biodata_div = bio.find_next('div', class_='biodata')
+ if biodata_div:
+ div_text = biodata_div.get_text(separator='|').strip()
+ biodata = [b.strip() for b in div_text.split('|') if b.strip()]
+ else:
+ biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
+
+ # 保存数据
+ if heading in reversed_map:
+ kkey = reversed_map[heading]
+ data[kkey] = biodata
+
+ # 添加统计数据到 data
+ data['movies_cnt'] = movies_cnt
+ data['vixen_cnt'] = distributor_count['vixen']
+ data['blacked_cnt'] = distributor_count['blacked']
+ data['tushy_cnt'] = distributor_count['tushy']
+ data['x_art_cnt'] = distributor_count['x-art']
+ data['credits'] = credits_list
+
+ return data
+
+
+
+# 解析网页 HTML 并提取电影信息
+def parse_page_movie(soup, href, title):
+ # 解析电影基础信息
+ movie_data = {}
+ info_div = soup.find("div", class_="col-xs-12 col-sm-3")
+ if info_div:
+ labels = info_div.find_all("p", class_="bioheading")
+ values = info_div.find_all("p", class_="biodata")
+ for label, value in zip(labels, values):
+ key = label.text.strip()
+ if key == "Directors": # 解析多位导演的情况
+ directors = []
+ links = value.find_all("a")
+ for link in links:
+ director_name = link.text.strip()
+ director_href = host_url + link['href'] if link['href'] else ''
+ directors.append({"name": director_name, "href": director_href})
+ movie_data[key] = directors
+ else:
+ val = value.text.strip()
+ if key in ["Distributor", "Studio", "Director"]:
+ link = value.find("a")
+ if link:
+ val = link.text.strip()
+ movie_data[f'{key}Href'] = host_url + link['href']
+ movie_data[key] = val
+ else:
+ return None
+
+ # 解析演职人员信息
+ performers = []
+ cast_divs = soup.find_all("div", class_="castbox")
+ for cast in cast_divs:
+ performer = {}
+ link = cast.find("a")
+ if link:
+ performer["name"] = link.text.strip()
+ performer["href"] = host_url + link["href"]
+
+ #performer["tags"] = [
+ # tag.strip() for br in cast.find_all("br")
+ # if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
+ #]
+
+ tags = []
+ for br in cast.find_all("br"):
+ tag = br.next_sibling
+ if isinstance(tag, str) and tag.strip():
+ tags.append(tag.strip())
+ performer["tags"] = tags
+
+ #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
+ performers.append(performer)
+
+ # 解析场景拆解
+ scene_breakdowns = []
+ scene_table = soup.find("div", id="sceneinfo")
+ if scene_table:
+ rows = scene_table.find_all("tr")
+
+ for row in rows:
+ cols = row.find_all("td")
+ if len(cols) >= 2:
+ scene = cols[0].text.strip() # 场景编号
+ performer_info = cols[1] # 包含表演者及链接信息
+
+                # 获取 <br> 之前的完整 HTML(保留 <a> 标签等格式)
+                performer_html = str(performer_info)  # 获取所有HTML内容
+                split_html = performer_html.split("<br>")  # 按 <br> 进行分割
+                if split_html:
+                    performers_html = split_html[0].strip()  # 取 <br> 之前的部分
+                else:
+                    split_html = performer_html.split("<br/>")  # 按 <br/> 进行分割
+                if split_html:
+                    performers_html = split_html[0].strip()  # 取 <br/> 之前的部分
+                else:
+                    performers_html = performer_html.strip()  # 如果没有 <br>,取全部
+
+ # 解析为纯文本(去除HTML标签,仅提取文本内容)
+ performers_soup = BeautifulSoup(performers_html, "html.parser")
+ performers_text = performers_soup.get_text()
+
+ # 提取表演者
+ scene_performers = [p.strip() for p in performers_text.split(",")]
+
+ # 尝试获取 `webscene` 和 `studio`
+ links_data = {}
+ links = performer_info.find_all("a")
+ if links:
+ webscene_title = links[0].text.strip() if len(links)>0 else None
+ webscene = links[0]["href"] if len(links)>0 else None
+ studio = links[1].text.strip() if len(links)>1 else None
+ studio_lnk = links[1]["href"] if len(links)>1 else None
+ links_data = {
+ "title": webscene_title,
+ "webscene": webscene,
+ "studio": studio,
+ "studio_lnk": studio_lnk,
+ }
+
+ scene_data = {
+ "scene": scene,
+ "performers": scene_performers,
+ **links_data,
+ }
+ scene_breakdowns.append(scene_data)
+
+ appears_in = []
+ appears_divs = soup.find("div", id="appearssection")
+ if appears_divs:
+ rows = appears_divs.find_all("li")
+ for row in rows:
+ lnk = row.find("a")
+ if lnk:
+ appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
+
+
+ return {
+ "href": href,
+ "title": title,
+ "Minutes": movie_data.get("Minutes", ""),
+ "Distributor": movie_data.get("Distributor", ""),
+ "Studio": movie_data.get("Studio", ""),
+ "ReleaseDate": movie_data.get("Release Date", ""),
+ "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
+ "All-Girl": movie_data.get("All-Girl", ""),
+ "All-Male": movie_data.get("All-Male", ""),
+ "Compilation": movie_data.get("Compilation", ""),
+ "Webscene": movie_data.get("Webscene", ""),
+ "Director": movie_data.get("Director", ""),
+ "DirectorHref": movie_data.get("DirectorHref", ""),
+ "DistributorHref": movie_data.get("DistributorHref", ""),
+ "StudioHref": movie_data.get("StudioHref", ""),
+ "Directors": movie_data.get("Directors", []), # 可能存在的元素
+ "Performers": performers,
+ "SceneBreakdowns": scene_breakdowns,
+ "AppearsIn": appears_in,
+ }
+
+
+if __name__ == "__main__":
+
+ for astro in astro_list:
+ url = astr_base_url + astro
+ next_url = url
+ logging.info(f"Fetching data for {astro}, url {url} ...")
+
+ while True:
+ soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
+ if soup:
+ list_data, next_url = parse_page_astro(soup, astro)
+ if list_data:
+ print(list_data[0] if len(list_data)>0 else 'no data')
+ break
+ else:
+ logging.info(f"Retrying {next_url} ...")
+ time.sleep(5) # 等待后再重试
+
+ time.sleep(2) # 控制访问频率
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py
index 042d2e3..a1e6557 100644
--- a/scrapy_proj/scrapy_proj/utils/utils.py
+++ b/scrapy_proj/scrapy_proj/utils/utils.py
@@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
)
return urlunparse(new_parsed)
+def pretty_json_simple(item):
+ try:
+ # 转换为单行JSON格式,需要保证传入的是map,不能是list
+ return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
+    except Exception:
+ # 转换失败时返回原始字符串
+ return item
+
\ No newline at end of file