modify scripts
@@ -134,7 +134,8 @@ fi
 # monthly tasks
 if [ "${PERIOD}" = "--monthly" ]; then
     register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
-    register_spider "pbox" "scrapy crawl javhd -a mod='update' "
+    register_spider "javhd" "scrapy crawl javhd -a mod='update' "
+    register_spider "lord" "scrapy crawl lord -a mod='update' "
 fi

@@ -6,6 +6,7 @@ from datetime import datetime
 from typing import List, Dict
 from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
 import scrapy_proj.comm.comm_def as comm
+from scrapy_proj.utils.utils import pretty_json_simple

 # handler registry
 spider_handler_registry = {}
@@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
         except sqlite3.Error as e:
             logging.error(f"query error: {e}")
             return 0
+
+
+@register_handler(comm.SPIDER_NAME_LORD)
+class LordDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=shared_db_path):
+        super().__init__(db_path)
+        self.tbl_name_actors = 'thelordofporn_actress'
+        self.tbl_name_alias = 'thelordofporn_alias'
+
+    def insert_item(self, item):
+        if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
+            self.insert_actor(item)
+        else:
+            logging.error(f"unknown item.")
+
+        return item
+
+    def insert_actor(self, item):
+        actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
+        if actor_id:
+            for alias in item.get('alias', []):
+                alias_data = {'actress_id': actor_id, 'alias': alias}
+                affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id', 'alias'], exists_do_nothing=False)
+                if affected_rows:
+                    logging.debug(f"insert/update actress_alias. data: {alias_data}")
+                else:
+                    logging.warning(f"insert actor alias error! data: {alias_data}")
+        else:
+            logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")
+
+    # statistics helper
+    def get_stat(self):
+        try:
+            self.cursor.execute(f"""
+                SELECT
+                    (SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
+            """)
+
+            row = self.cursor.fetchone()
+            if not row:
+                logging.warning(f"query no results.")
+                return {}
+
+            columns = [desc[0] for desc in self.cursor.description]
+            return dict(zip(columns, row))
+
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return {}
+
+    def has_full_data(self, href):
+        try:
+            self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
+            row = self.cursor.fetchone()
+            return row[0] if row else None
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return 0
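The @register_handler(...) decorator applied to the new LordDBHandler is defined elsewhere in this module; as a reference, a minimal standalone sketch of the registry pattern it appears to implement (the names and the "lord" key below are illustrative, not taken from the project):

import logging

spider_handler_registry = {}

def register_handler(spider_name):
    """Class decorator: map a spider name to its DB handler class."""
    def wrapper(cls):
        spider_handler_registry[spider_name] = cls
        return cls
    return wrapper

@register_handler("lord")
class LordDBHandler:
    pass

print(spider_handler_registry)   # {'lord': <class '__main__.LordDBHandler'>}

Looking up spider_handler_registry[spider.name] at pipeline time then gives the right handler class without a chain of if/elif branches.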
@@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta):  # singleton metaclass applied
             logging.error(f"Error inserting or updating data: {e}")
             return None

+    def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
+        """
+        Insert or update a row in a table that uses a composite primary key.
+
+        :param tbl_name: table name
+        :param data: dict of column values to insert or update
+        :param composite_pk: list of column names that make up the composite primary key
+        :param exists_do_nothing: when True, do nothing if the record already exists (default True)
+        :return: number of affected rows
+        """
+        try:
+            # validate the composite-key argument
+            if not isinstance(composite_pk, list) or len(composite_pk) < 2:
+                logging.error(f"composite_pk must be a list with at least two fields: {composite_pk}")
+                return None
+
+            processed_data = self.check_and_process_data(data, tbl_name)
+
+            # make sure every composite-key field is present in the data
+            for pk_field in composite_pk:
+                if pk_field not in processed_data:
+                    logging.error(f"composite key field '{pk_field}' is missing from the data")
+                    return None
+
+            # build the lookup condition
+            where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
+            pk_values = [processed_data[pk] for pk in composite_pk]
+
+            # check whether the record already exists
+            self.cursor.execute(
+                f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
+                pk_values
+            )
+            exists = self.cursor.fetchone() is not None
+
+            if exists:
+                if exists_do_nothing:
+                    return 0
+
+                # build the update columns (excluding the composite-key fields)
+                update_fields = [f for f in processed_data.keys() if f not in composite_pk]
+                if not update_fields:
+                    return 0
+
+                set_clause = ", ".join([f"{field} = ?" for field in update_fields])
+                update_values = [processed_data[field] for field in update_fields] + pk_values
+
+                # run the update (standard syntax, compatible with older SQLite versions)
+                update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
+                self.cursor.execute(update_sql, update_values)
+                return 1
+            else:
+                # insert a new row
+                columns = ", ".join(processed_data.keys())
+                placeholders = ", ".join(["?" for _ in processed_data.keys()])
+                insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
+                self.cursor.execute(insert_sql, list(processed_data.values()))
+                return 2
+        except sqlite3.Error as e:
+            logging.error(f"Error inserting or updating data: {e}")
+            return None
+
     def get_id_by_key(self, tbl, uniq_key, val):
         self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
         row = self.cursor.fetchone()
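For context, a self-contained sketch of the same check-then-update-or-insert flow against an in-memory SQLite table with a composite primary key; the table and column names below are illustrative only:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE actress_alias (actress_id INTEGER, alias TEXT, PRIMARY KEY (actress_id, alias))")

def upsert(data, tbl, composite_pk):
    where = " AND ".join(f"{c} = ?" for c in composite_pk)
    pk_vals = [data[c] for c in composite_pk]
    cur.execute(f"SELECT 1 FROM {tbl} WHERE {where}", pk_vals)
    if cur.fetchone():
        fields = [c for c in data if c not in composite_pk]
        if not fields:
            return 0                      # record exists, nothing besides the key to update
        set_clause = ", ".join(f"{c} = ?" for c in fields)
        cur.execute(f"UPDATE {tbl} SET {set_clause} WHERE {where}", [data[c] for c in fields] + pk_vals)
        return 1
    cols = ", ".join(data)
    qs = ", ".join("?" for _ in data)
    cur.execute(f"INSERT INTO {tbl} ({cols}) VALUES ({qs})", list(data.values()))
    return 2

print(upsert({"actress_id": 1, "alias": "A"}, "actress_alias", ["actress_id", "alias"]))  # 2 (inserted)
print(upsert({"actress_id": 1, "alias": "A"}, "actress_alias", ["actress_id", "alias"]))  # 0 (already present)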
@@ -193,3 +193,32 @@ class JavHDActorItem(scrapy.Item):
     ethnicity = scrapy.Field()
     birth_place = scrapy.Field()
     is_full_data = scrapy.Field()
+
+
+class LordActorItem(scrapy.Item):
+    item_type = scrapy.Field()
+    pornstar = scrapy.Field()
+    rating = scrapy.Field()
+    rank = scrapy.Field()
+    votes = scrapy.Field()
+    href = scrapy.Field()
+    career_start = scrapy.Field()
+    measurements = scrapy.Field()
+    born = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    date_modified = scrapy.Field()
+    global_rank = scrapy.Field()
+    weekly_rank = scrapy.Field()
+    last_month_rating = scrapy.Field()
+    current_rating = scrapy.Field()
+    total_votes = scrapy.Field()
+    birth_date = scrapy.Field()
+    birth_year = scrapy.Field()
+    birth_place = scrapy.Field()
+    height_ft = scrapy.Field()
+    height_cm = scrapy.Field()
+    weight_lbs = scrapy.Field()
+    weight_kg = scrapy.Field()
+    is_full_data = scrapy.Field()
+    alias = scrapy.Field()
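A small sketch of how such an Item is typically filled from a parsed dict, mirroring the "if k in item.fields" guard used later in lord_spider.py; the trimmed-down item class and sample values below are made up:

import scrapy

class DemoActorItem(scrapy.Item):
    pornstar = scrapy.Field()
    rating = scrapy.Field()
    href = scrapy.Field()

parsed = {"pornstar": "Example Name", "rating": 88, "ignored_key": "dropped"}
item = DemoActorItem()
for k, v in parsed.items():
    if k in item.fields:        # unknown keys are skipped instead of raising KeyError
        item[k] = v
print(dict(item))               # {'pornstar': 'Example Name', 'rating': 88}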
@@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
             yield request

     def parse(self, response):
-        """Unified response entry point"""
+        """Unified response entry point. In practice it is never used, because requests go straight to the callback given on scrapy.Request."""
         # record the request duration
         request_time = response.meta.get('request_time')
         if request_time:
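To illustrate the point made in the updated docstring: once a request carries an explicit callback, Scrapy routes the response to that callback and parse() is never invoked. A minimal, hypothetical spider:

import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # only reached for requests created without an explicit callback
        self.logger.info("parse() called for %s", response.url)

    def start_requests(self):
        for url in self.start_urls:
            # this response goes directly to handle_page(); parse() is skipped
            yield scrapy.Request(url, callback=self.handle_page)

    def handle_page(self, response):
        self.logger.info("handle_page() called for %s", response.url)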
@@ -1,15 +1,19 @@
 import scrapy
 import re
+import sys
+from urllib.parse import urljoin, quote_plus
 from scrapy_proj.spiders.base_spider import BaseSpider
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
 from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
 from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
+from scrapy_proj.spiders.parser.iafd_parser import common_parser
+from scrapy_proj.utils.utils import pretty_json_simple

 db_tools = IAFDDBHandler()

 class IAFDSpider(BaseSpider):
     name = SPIDER_NAME_IAFD
-    allowed_domains = ["iafd.com"]
+    allowed_domains = ["iafd.com", "www.iafd.com"]

     host_url = "https://www.iafd.com"
     astr_base_url = f"{host_url}/astrology.rme/sign="
@@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
     studios_list_url = f"{host_url}/studio.asp"
     ethnic_list_url = f'{host_url}/advsearch.asp'

-    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
+    def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.update = int(update)
+        self.update_mode = True if mod and mod.lower() == 'update' else False
         self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

         self.cmd_astro = 'astro'
@@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
         query_args = {}
         if self.debug:
             query_args['limit'] = 5
-        if self.update == 0:
+        if self.update_mode:
             query_args['is_full_data'] = 0
+            query_args['is_full_data'] = 404

         # read the list of performers waiting to be updated
         if self.cmd_performers in self.cmd_list:
@@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
                 href = item.get('href', '')
                 movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
                 self.logger.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type': 'actor'})

         # read the list of movies waiting to be updated
         if self.cmd_movies in self.cmd_list:
@@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
             for item in movies:
                 href = item.get('href', '')
                 self.logger.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type': 'movie'})


     def start_astro(self):
@@ -113,49 +118,27 @@ class IAFDSpider(BaseSpider):
             yield request

     def parse_astro_page(self, response):
-        astro = response.meta['astro']
-        astro_div = response.css('div#astro')
-        if astro_div:
-            birth_date = None
-            for elem in astro_div.css('*'):
-                if elem.css('h3.astroday'):
-                    birth_date = elem.css('h3.astroday::text').get().strip()
-                elif elem.css('div.perficon'):
-                    a_tag = elem.css('a')
-                    if a_tag:
-                        href = self.host_url + a_tag.attrib['href']
-                        name = a_tag.css('span.perfname::text').get()
-                        if name:
-                            item = IAFDPersonItem()
-                            item['name'] = name
-                            item['href'] = href
-                            item['from_astro_list'] = 1
-                            item['from_birth_list'] = 0
-                            item['from_ethnic_list'] = 0
-                            item['from_movie_list'] = 0
-                            yield item
-                            #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+        astro = response.meta.get('astro', '')
+        data, next_url = common_parser(html=response.text, page='astro', astro=astro)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"parse data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_birth_page(self, response):
         month = response.meta['month']
         day = response.meta['day']
-        datarows = response.css('div.col-sm-12.col-lg-9')
-        if datarows:
-            rows = datarows[0].css('div.col-sm-4')
-            for row in rows:
-                link_tag = row.css('a')
-                person = link_tag.css('::text').get().strip() if link_tag else ''
-                href = self.host_url + link_tag.attrib['href'] if link_tag else ''
-
-                item = IAFDPersonItem()
-                item['name'] = person
-                item['href'] = href
-                item['from_astro_list'] = 0
-                item['from_birth_list'] = 1
-                item['from_ethnic_list'] = 0
-                item['from_movie_list'] = 0
-                yield item
-                #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+        data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"parse data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_ethnic_list_page(self, response):
         div_root = response.css('select#ethnicity1')
@@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
             href = option.attrib.get('value')
             text = option.css('::text').get().strip()
             if href and href.lower() != 'none':
-                ethnic_url = self.host_url + href
+                ethnic_url = urljoin(response.url, href)
+                self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                 yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
-            if self.debug:
-                break

     def parse_ethnic_page(self, response):
         ethnic = response.meta['ethnic']
-        rows = response.css('div.row.headshotrow')
-        for row in rows:
-            cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
-            for col in cols:
-                link_tag = col.css('a')
-                img_tag = col.css('div.pictag')
-                if link_tag and img_tag:
-                    href = self.host_url + link_tag.attrib['href']
-                    person = img_tag.css('::text').get().strip()
-
-                    item = IAFDPersonItem()
-                    item['name'] = person
-                    item['href'] = href
-                    item['from_astro_list'] = 0
-                    item['from_birth_list'] = 0
-                    item['from_ethnic_list'] = 1
-                    item['from_movie_list'] = 0
-                    yield item
-                    #yield scrapy.Request(href, callback=self.parse_person_detail_page)
-
-        next_page = response.css('a[rel="next"]')
-        if next_page:
-            next_url = self.host_url + next_page.attrib['href']
-            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+        data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
         else:
-            self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
-            self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
+            self.logger.warning(f"parse data error. {response.url}")
+
+        if next_url:
+            self.logger.info(f"find next page: {next_url}")
+        else:
+            self.logger.info(f"found all pages. url: {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_distributors_list_page(self, response):
         select_element = response.css('select[name="Distrib"]')
@@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
         for option in options:
             value = option.attrib.get('value')
             text = option.css('::text').get().strip()
-            dis_url = self.host_url + f"/distrib.rme/distrib={value}"
-            item = IAFDMovieItem()
-            item['title'] = text
-            item['href'] = dis_url
-            item['release_year'] = 0
-            item['from_performer_list'] = 0
-            item['from_dist_list'] = 1
-            item['from_stu_list'] = 0
-            yield item
-            #yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
+            dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
+            yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})

     def parse_studios_list_page(self, response):
         select_element = response.css('select[name="Studio"]')
@@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
         for option in options:
             value = option.attrib.get('value')
             text = option.css('::text').get().strip()
-            stu_url = self.host_url + f"/studio.rme/studio={value}"
-            item = IAFDMovieItem()
-            item['title'] = text
-            item['href'] = stu_url
-            item['release_year'] = 0
-            item['from_performer_list'] = 0
-            item['from_dist_list'] = 0
-            item['from_stu_list'] = 1
-            yield item
-            #yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
+            dis_url = f"{self.host_url}/studio.rme/studio={value}"
+            yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})

+    def parse_stu_dist_page(self, response):
+        list_type = response.meta.get('list_type', '')
+        data, next_url = common_parser(html=response.text, page=list_type)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item
+
+
     def parse_person_detail_page(self, response):
+        data = common_parser(html=response.text, page='actor', url=response.url)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
         item = IAFDPersonDetailItem()
-        item['href'] = response.url
-        item['person'] = response.css('h1::text').get()  # assume the name is in the h1 tag
-        # parse the remaining details; adjust to the real page structure
-        item['gender'] = response.css('span.gender::text').get()
-        item['birthday'] = response.css('span.birthday::text').get()
-        item['astrology'] = response.css('span.astrology::text').get()
-        item['birthplace'] = response.css('span.birthplace::text').get()
-        item['years_active'] = response.css('span.years_active::text').get()
-        item['ethnicity'] = response.css('span.ethnicity::text').get()
-        item['nationality'] = response.css('span.nationality::text').get()
-        item['hair_colors'] = response.css('span.hair_colors::text').get()
-        item['eye_color'] = response.css('span.eye_color::text').get()
-        item['height'] = response.css('span.height::text').get()
-        item['weight'] = response.css('span.weight::text').get()
-        item['measurements'] = response.css('span.measurements::text').get()
-        item['tattoos'] = response.css('span.tattoos::text').get()
-        item['piercings'] = response.css('span.piercings::text').get()
-        item['movies_cnt'] = response.css('span.movies_cnt::text').get()
-        item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
-        item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
-        item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
-        item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
-        item['performer_aka'] = response.css('span.performer_aka::text').getall()
-        yield item
+        #yield item

     def parse_movie_detail_page(self, response):
+        title = response.meta.get('title', '')
+        data = common_parser(html=response.text, page='movies', href=response.url, title=title)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
         item = IAFDMovieDetailItem()
-        item['title'] = response.css('h1::text').get()  # assume the title is in the h1 tag
-        item['href'] = response.url
-        # parse the remaining details; adjust to the real page structure
-        yield item
+        #yield item
+
+    def custom_block_check(self, response):
+        item_type = response.meta.get('item_type', '')
+        if "invalid or outdated page" in response.text.lower():
+            self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
+            return "invalid or outdated page"
+        else:
+            self.logger.info(f"right content. url: {response.url}")
+
+        return None
+
+    # handle error pages, mainly 404 and 403
+    def handle_blocked(self, response, reason):
+        item_type = response.meta.get('item_type', '')
+        if response.status in [404, 403]:
+            self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
@@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
             item['rank'] = rank
             item['url'] = url
             item[f'{lang}_name'] = name
-            #TODO: for non-English pages, go and update the matching name
+            # for non-English pages, go and update the matching name
            if lang != 'en':
                 item['url'] = replace_lang_param(item['url'])
             yield item
@@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
                     meta={"list_item": item}  # pass list-page data to the detail page
                 )
             else:
-                self.logger.info(f"actor(name) has full data. skip. url: {url}")
+                self.logger.info(f"actor({name}) has full data. skip. url: {url}")

         # fetch the next page
         next_path = data.get("pagination_params", {}).get("next")
scrapy_proj/scrapy_proj/spiders/lord_spider.py (new file, 399 lines)
@@ -0,0 +1,399 @@
import scrapy
import sys
import re
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import LordActorItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler

db_tools = LordDBHandler()

class LordSpider(BaseSpider):
    name = SPIDER_NAME_LORD
    allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]

    # request headers (reused from the original curl command)
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
            "priority": "u=0, i",
            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"macOS\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
        },
        "COOKIES_ENABLED": True  # enable cookie support
    }

    def __init__(self, debug='false', mod='update', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
        self.update_mod = False if mod and mod.lower() == 'force' else True

        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

    # entry point, triggered by the base class
    def custom_start_requests(self):
        url = 'https://thelordofporn.com/pornstars/'
        yield scrapy.Request(
            url=url,
            headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
            callback=self.parse_list,
            meta={}  # pass list-page data to the detail page
        )

    def parse_list(self, response):
        # extract every actor entry (article.loop-item in the original code)
        articles = response.css("article.loop-item")
        self.logger.info(f"found {len(articles)} actor entries on page {response.url}")

        for article in articles:
            try:
                # actor name and detail-page link
                title_tag = article.css("h3.loop-item__title a")
                title = title_tag.css("::text").get(default="N/A").strip()
                href = title_tag.attrib.get("href")  # href attribute of the <a> tag

                # rating
                rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()

                # rank and vote count (meta_tags in the original code)
                meta_tags = article.css("div.loop-item__rank span")
                rank = None
                votes = None

                # rank (the <b> tag inside the first span)
                if len(meta_tags) >= 1:
                    rank_b = meta_tags[0].css("b::text").get()
                    rank = rank_b.strip() if rank_b else "N/A"

                # vote count (the <b> tag inside the second span)
                if len(meta_tags) >= 2:
                    votes_b = meta_tags[1].css("b::text").get()
                    votes = votes_b.strip() if votes_b else "N/A"

                # convert to numeric values (mirrors utils.parse_numeric in the original code)
                def parse_numeric(value):
                    if not value or value == "N/A":
                        return None
                    # strip non-digit characters (commas, %, etc.)
                    numeric_str = ''.join(filter(str.isdigit, value))
                    return int(numeric_str) if numeric_str else None

                # build the actor data dict
                actress_data = {
                    "pornstar": title,
                    "rating": parse_numeric(rating),
                    "rank": parse_numeric(rank),
                    "votes": parse_numeric(votes),
                    "href": href if href else None
                }
                # request the detail page
                actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
                if actor_exists < 1:
                    yield scrapy.Request(
                        url=href,
                        callback=self.parse_actor_detail,
                        headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                        meta={'actor': actress_data}
                    )
                else:
                    self.logger.info(f"actor({title}) has full data. skip. url: {href}")

            except Exception as e:
                self.logger.error(f"failed to parse actor entry: {e}, page: {response.url}")
                continue  # skip the broken entry and keep going

        # next-page link (.next.page-numbers in the original code)
        next_page_url = None
        next_page_tag = response.css(".nav-links .next.page-numbers")
        if next_page_tag:
            next_page_href = next_page_tag.attrib.get("href")
            if next_page_href and not self.debug:
                # build the absolute URL (handles relative paths)
                next_page_url = urljoin(response.url, next_page_href)
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse_list,
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                    meta={}
                )
        else:
            self.logger.info(f"all pages parsed, current url: {response.url}")

    def parse_actor_detail(self, response):
        # 1. field mapping table: raw page field -> Item field
        FIELD_MAPPING = {
            # basic info
            'date_modified': 'date_modified',
            # rank info
            'Global Rank': 'global_rank',
            'Weekly Rank': 'weekly_rank',
            # rating info
            'Last Month': 'last_month_rating',
            'Rating Av.': 'current_rating',
            'Total of Votes': 'total_votes',
            # detailed attributes
            'Career start': 'career_start',
            'Measurements': 'measurements',
            'Born': 'born',
            'Height': 'height',
            'Weight': 'weight',
            'Name': 'alias_raw',  # aliases come from the Name field
            # parsed fields (birth / height / weight)
            'birth_date': 'birth_date',
            'birth_year': 'birth_year',
            'birth_place': 'birth_place',
            'height_ft': 'height_ft',
            'height_cm': 'height_cm',
            'weight_lbs': 'weight_lbs',
            'weight_kg': 'weight_kg',
            'alias': 'alias'
        }

        # 2. raw data container
        raw_data = {}
        # 3. basic info
        raw_data['href'] = response.url
        entry_header = response.css("header.entry-header")
        raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
        raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()

        # 4. rank info
        for item in entry_header.css("div.porn-star-rank__item"):
            item_text = item.css("::text").get(default="").strip()
            raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))

        # 5. rating and vote info
        for item in response.css("div.specifications__item--horizontal"):
            # 1. locate the title area precisely (excluding the <b> tag)
            # case 1: structure with a child div (e.g. Rating Av. with an img)
            title_div = item.css("div:first-child")
            if title_div:
                # only take text inside the child div (the sibling <b> tag is excluded automatically)
                title_parts = title_div.css("::text").getall()
            else:
                # cases 2 and 3: no child div (Last Month and Total of Votes)
                # take all text inside the item, then drop the <b> tag text
                all_text_parts = item.css("::text").getall()
                b_text_parts = item.css("b::text").getall()
                # remove the <b> text from the full text
                title_parts = [t for t in all_text_parts if t not in b_text_parts]

            # 2. clean the title text (non-breaking spaces and whitespace)
            title_text = "".join(title_parts)
            title_text = title_text.replace(u'\xa0', u' ')  # replace non-breaking spaces
            title_text = re.sub(r'\s+', ' ', title_text).strip()  # collapse whitespace

            raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))

        # 6. detailed attributes (specifications-grid-row)
        for row in response.css("div.specifications-grid-row"):
            items = row.css("div.specifications-grid-item")
            for i in [0, 1]:  # two attributes per row
                if i < len(items):
                    label = extract_text_from_element(items[i].css("h5"))
                    value = extract_text_from_element(items[i].css("span"))
                    if label:
                        raw_data[label] = value

        # 7. special fields (aliases need cleaning)
        raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))

        # 9. parse and merge birth info, height and weight
        raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
        raw_data.update(self.parse_height(raw_data.get("Height", "")))
        raw_data.update(self.parse_weight(raw_data.get("Weight", "")))

        # 10. map into the Item and return it
        item = LordActorItem()
        item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
        actor_data = response.meta['actor']
        for k, v in actor_data.items():
            if k in item.fields:
                item[k] = v

        for raw_field, item_field in FIELD_MAPPING.items():
            if item_field in item.fields:
                item[item_field] = raw_data.get(raw_field, "")

        # mark the record as complete
        item['is_full_data'] = 1
        self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")

        yield item

    # original helper functions, kept as methods of the spider class
    def parse_birth_info(self, text):
        match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
        if match:
            return {
                "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
                "birth_year": match.group(3),
                "birth_place": match.group(4),
            }
        return {"birth_date": text, "birth_year": "", "birth_place": ""}
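A quick standalone check of the birth-info pattern above; the sample strings are invented:

import re

pattern = r"(.+?) (\d{1,2}), (\d{4}) in (.+)"
for text in ["April 3, 1990 in Prague, Czech Republic", "1990"]:
    m = re.match(pattern, text, re.IGNORECASE)
    if m:
        print(f"{m.group(1)} {m.group(2)}, {m.group(3)}", "|", m.group(4))
    else:
        print("no match, raw value kept:", text)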

    def parse_height2(self, text):
        match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
        if match:
            height_ft = f"{match.group(1)}'{match.group(2)}\""
            return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
        return {"height_ft": text, "height_cm": ""}

    def parse_height(self, text):
        # normalize: commas to dots, fix the common ' n ' typo for ' in '
        text = text.replace(',', '.').replace(' n ', ' in ').strip()

        # regex matching the common feet-plus-inches formats
        # groups:
        # 1. feet value 2. feet unit (feet/ft/ft./') 3. inch value 4. inch unit (inches/in/in./inch/")
        # 5. cm/m value 6. unit (cm/m)
        pattern = r"""
            # case 1: feet and inches first, then cm/m (the common format)
            (?:(\d+)\s*(feet|ft\.?|')\s*)              # feet part (e.g. 5ft / 5')
            (?:and\s*)?                                # optional "and" (e.g. 5 feet and 2 inches)
            (\d+)\s*(inches|in\.?|inch|")?\s*          # inch part (e.g. 2in / 2")
            (?:\(?(\d+\.?\d*)\s*(cm|m)\)?)             # cm/m part (e.g. (157cm) / (1.57m))

            |  # or

            # case 2: cm first, feet and inches after (e.g. 170 cm / 5 feet and 7 inches)
            (\d+)\s*cm\s*/\s*                          # cm first
            (?:(\d+)\s*(feet|ft\.?|')\s*)              # feet part
            (?:and\s*)?
            (\d+)\s*(inches|in\.?|inch|")?             # inch part

            |  # or

            # case 3: pure shorthand (e.g. 5'3" (160 cm))
            (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)?            # 5'3" format
        """

        # VERBOSE ignores whitespace inside the pattern, IGNORECASE ignores case
        match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
        if not match:
            # handle a bare centimetre value (e.g. "160cm")
            cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
            if cm_match:
                return {"height_ft": "", "height_cm": cm_match.group(1)}
            return {"height_ft": text, "height_cm": ""}

        # pull out the matched groups, depending on which alternative hit
        ft = None
        inch = None
        cm = None
        num = None
        unit = None

        # case 1: feet and inches first, then cm/m
        if match.group(1) and match.group(3):
            ft = match.group(1)
            inch = match.group(3)
            num = match.group(5)
            unit = match.group(6).lower() if match.group(6) else 'cm'

        # case 2: cm first, then feet and inches
        elif match.group(7):
            cm = match.group(7)
            ft = match.group(8)
            inch = match.group(10)
            unit = 'cm'  # the leading unit in case 2 is always cm

        # case 3: pure shorthand (5'3"); these are groups 12-14 of the pattern
        elif match.group(12) and match.group(13):
            ft = match.group(12)
            inch = match.group(13)
            cm = match.group(14)
            unit = 'cm'

        # convert metres to centimetres when needed
        if not cm and num and unit:
            if unit == 'm':
                cm = str(int(float(num) * 100))  # 1.57m -> 157cm
            else:
                cm = num  # already centimetres

        # format the feet/inches expression (e.g. 5'2")
        height_ft = f"{ft}'{inch}\"" if ft and inch else ""

        return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}
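A standalone exercise of the verbose height pattern with a few invented inputs, showing which capture groups fire for each alternative:

import re

pattern = r"""
    (?:(\d+)\s*(feet|ft\.?|')\s*)(?:and\s*)?(\d+)\s*(inches|in\.?|inch|")?\s*(?:\(?(\d+\.?\d*)\s*(cm|m)\)?)
    |
    (\d+)\s*cm\s*/\s*(?:(\d+)\s*(feet|ft\.?|')\s*)(?:and\s*)?(\d+)\s*(inches|in\.?|inch|")?
    |
    (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)?
"""
for text in ["5 feet and 2 inches (157 cm)", "170 cm / 5 feet and 7 inches", "5'3\" (160 cm)"]:
    m = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
    print(text, "->", m.groups() if m else None)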


    def parse_weight2(self, text):
        match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
        if match:
            return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
        return {"weight_lbs": text, "weight_kg": ""}

    def parse_weight(self, text):
        # normalize whitespace and common formatting issues (non-breaking spaces)
        text = text.strip().replace(u'\xa0', u' ')

        # regex matching several weight formats
        # groups:
        # 1. pounds value 2. pounds unit (lb/lbs/pounds) 3. kg value 4. kg unit
        # 5. kg-first value 6. kg unit 7. trailing pounds value 8. pounds unit
        pattern = r"""
            # case 1: pounds first, kg after (the common format)
            (?:(\d+)\s*(lb|lbs|pounds)?\s*)     # pounds part (lb/lbs/pounds, unit optional)
            (?:\(?\s*(\d+)\s*(kg)\s*\)?)        # kg part (e.g. (45 kg))

            |  # or

            # case 2: kg first, pounds after (e.g. 52 kg / 114 lbs)
            (?:(\d+)\s*(kg)\s*/\s*)             # kg part
            (\d+)\s*(lb|lbs|pounds)?            # pounds part
        """

        # VERBOSE and IGNORECASE for robustness
        match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
        if not match:
            # bare kilograms (e.g. "52kg")
            kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
            if kg_match:
                return {"weight_lbs": "", "weight_kg": kg_match.group(1)}

            # bare pounds (e.g. "114lb")
            lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
            if lb_match:
                return {"weight_lbs": lb_match.group(1), "weight_kg": ""}

            # nothing parsable
            return {"weight_lbs": text, "weight_kg": ""}

        # pull out the matched groups, depending on which alternative hit
        weight_lbs = None
        weight_kg = None

        # case 1: pounds first, kg after
        if match.group(1) and match.group(3):
            weight_lbs = match.group(1)
            weight_kg = match.group(3)

        # case 2: kg first, pounds after
        elif match.group(5) and match.group(6):
            weight_kg = match.group(5)
            weight_lbs = match.group(7)

        return {
            "weight_lbs": weight_lbs.strip() if weight_lbs else "",
            "weight_kg": weight_kg.strip() if weight_kg else ""
        }

    def clean_alias(self, alias):
        alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
        return [name.strip() for name in alias.split(',') if name.strip()]

    def parse_numeric(self, value):
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0
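One note on clean_alias above: re.sub's fourth positional argument is count, so case-insensitive matching has to be passed as flags=. A short standalone check with an invented alias string:

import re

alias = "Jane Doe (age 30), J. Doe"
# passing re.IGNORECASE positionally would be interpreted as count, so use flags=
cleaned = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
print([n.strip() for n in cleaned.split(',') if n.strip()])   # ['Jane Doe', 'J. Doe']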
scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py (new file, 636 lines)
@@ -0,0 +1,636 @@

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils

# base URLs and variable parameters
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

ethnic_list_url = f'{host_url}/advsearch.asp'

# headers and scraper setup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99

save_raw_html = True
load_from_local = False

def common_parser(html, page, **kwargs):
    parser = "lxml" if page == 'ethnic' else "html.parser"
    soup = BeautifulSoup(html, parser)
    if not soup:
        return None
    if page == 'astro':
        #parse_page_astro(soup, astro):
        return parse_page_astro(soup, **kwargs)
    elif page == 'birth':
        #parse_page_birth(soup, month, day):
        return parse_page_birth(soup, **kwargs)
    elif page == 'ethnic':
        #parse_page_ethnic(soup, ethnic):
        return parse_page_ethnic(soup, **kwargs)
    elif page == 'dist':
        return parse_page_dist_stu(soup, 'distable')
    elif page == 'stu':
        return parse_page_dist_stu(soup, 'studio')
    elif page == 'actor':
        #parse_page_performer(soup, url):
        return parse_page_performer(soup, **kwargs)
    elif page == 'movies':
        #parse_page_movie(soup, href, title)
        return parse_page_movie(soup, **kwargs)
    else:
        logging.warning(f"wrong page: {page}")
        return None
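A sketch of how the spiders call this dispatcher (assuming scrapy_proj and its dependencies are importable; the HTML and URL below are placeholders). The list-style pages ('astro', 'birth', 'ethnic', 'dist', 'stu') return a (data, next_url) pair, while 'actor' and 'movies' return a single dict:

from scrapy_proj.spiders.parser.iafd_parser import common_parser

html = "<html><body><div id='astro'></div></body></html>"   # placeholder markup
persons, next_url = common_parser(html=html, page='astro', astro='Aries')                    # list-style page
actor_detail = common_parser(html=html, page='actor', url='https://www.iafd.com/placeholder')  # detail page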
|
|
||||||
|
'''
|
||||||
|
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||||
|
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||||
|
if load_from_local: # 从本地读取的逻辑
|
||||||
|
html = utils.read_raw_html(url)
|
||||||
|
if html:
|
||||||
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
html_text = preprocessor(html) if preprocessor else html
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_text, parser)
|
||||||
|
if validator(soup): # 进行自定义页面检查
|
||||||
|
return soup, http_code_local # 返回一个小于100的错误码,表明是从本地返回的
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
if host_url not in url.lower():
|
||||||
|
logging.error(f'wrong url format: {url}')
|
||||||
|
return None, http_code_url
|
||||||
|
|
||||||
|
response = scraper.get(url, headers=headers)
|
||||||
|
|
||||||
|
# 处理 HTTP 状态码
|
||||||
|
if response.status_code == 404:
|
||||||
|
logging.debug(f"Page not found (404): {url}")
|
||||||
|
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
|
response.raise_for_status() # 处理 HTTP 错误
|
||||||
|
|
||||||
|
# 过期的网页,与404相同处理
|
||||||
|
if "invalid or outdated page" in response.text.lower():
|
||||||
|
logging.debug(f"invalid or outdated page: {url}")
|
||||||
|
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
|
if save_raw_html:
|
||||||
|
utils.write_raw_html(url, response.text)
|
||||||
|
|
||||||
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_text, parser)
|
||||||
|
if validator(soup): # 进行自定义页面检查
|
||||||
|
return soup, response.status_code
|
||||||
|
else:
|
||||||
|
# 检查是否发生跳转,比如到登录页面
|
||||||
|
if response.history:
|
||||||
|
logging.warning(f"Page redirected on {url}. Validation failed.")
|
||||||
|
return None, http_code_login
|
||||||
|
|
||||||
|
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||||
|
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||||
|
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
|
||||||
|
except cloudscraper.exceptions.CloudflareCode1020 as e:
|
||||||
|
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
||||||
|
|
||||||
|
logging.error(f'Fetching failed after max retries. {url}')
|
||||||
|
return None, None # 达到最大重试次数仍然失败
|
||||||
|
'''
|
||||||
|
|
||||||
|
# 修复 HTML 结构,去除多余标签并修正 <a> 标签,在获取人种的时候需要
|
||||||
|
def preprocess_html(html):
|
||||||
|
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
|
||||||
|
|
||||||
|
# 通用的 HTML 结构验证器
|
||||||
|
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||||
|
if attr_type == "id":
|
||||||
|
return soup.find(tag, id=identifier) is not None
|
||||||
|
elif attr_type == "class":
|
||||||
|
return bool(soup.find_all(tag, class_=identifier))
|
||||||
|
elif attr_type == "name":
|
||||||
|
return bool(soup.find('select', {'name': identifier}))
|
||||||
|
return False
|
||||||
|
|
||||||
|
# 检查电影信息是否存在
|
||||||
|
def movie_validator(soup, table_id):
|
||||||
|
return soup.find("table", id=table_id) is not None
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_ethnic_list(soup, href):
|
||||||
|
div_root = soup.find("select", id="ethnicity1")
|
||||||
|
if not div_root:
|
||||||
|
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
list_data = []
|
||||||
|
|
||||||
|
# 提取所有的 <option> 标签
|
||||||
|
options = div_root.find_all('option')
|
||||||
|
if options:
|
||||||
|
# 解析并输出 value 和文本内容
|
||||||
|
for option in options:
|
||||||
|
href = option.get('value', None)
|
||||||
|
text = option.text.strip()
|
||||||
|
if href and href.lower() == 'none':
|
||||||
|
continue
|
||||||
|
list_data.append({
|
||||||
|
"name": text,
|
||||||
|
"href": host_url + href if href else ''
|
||||||
|
})
|
||||||
|
return list_data
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_astro(soup, astro):
|
||||||
|
astro_div = soup.find("div", id="astro")
|
||||||
|
if not astro_div:
|
||||||
|
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
flag = False
|
||||||
|
list_cnt = 0
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
birth_date = None
|
||||||
|
for elem in astro_div.find_all(recursive=False):
|
||||||
|
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||||
|
birth_date = elem.get_text(strip=True)
|
||||||
|
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||||
|
a_tag = elem.find("a")
|
||||||
|
if a_tag:
|
||||||
|
href = host_url + a_tag["href"]
|
||||||
|
name = a_tag.find("span", class_="perfname")
|
||||||
|
if name:
|
||||||
|
list_data.append({
|
||||||
|
"astrology": astro,
|
||||||
|
"birth_date": birth_date,
|
||||||
|
"person": name.get_text(strip=True),
|
||||||
|
"href": href
|
||||||
|
})
|
||||||
|
flag = True
|
||||||
|
list_cnt = list_cnt +1
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# 解析页面内容并更新birth_map
|
||||||
|
def parse_page_birth(soup, month, day):
|
||||||
|
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||||
|
if not datarows:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
flag = False
|
||||||
|
list_cnt = 0
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||||
|
for row in rows:
|
||||||
|
link_tag = row.find('a')
|
||||||
|
person = link_tag.text.strip() if link_tag else ''
|
||||||
|
href = link_tag['href'] if link_tag else ''
|
||||||
|
href = host_url + href
|
||||||
|
|
||||||
|
# 如果 href 已经在 birth_map 中,跳过
|
||||||
|
flag = True
|
||||||
|
if any(entry['href'] == href for entry in list_data):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 将数据添加到 birth_map
|
||||||
|
list_data.append({
|
||||||
|
'month': month,
|
||||||
|
'day': day,
|
||||||
|
'person': person,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
list_cnt = list_cnt +1
|
||||||
|
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_ethnic(soup, ethnic):
|
||||||
|
rows = soup.find_all('div', class_='row headshotrow')
|
||||||
|
flag = False
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||||
|
link_tag = col.find('a')
|
||||||
|
img_tag = col.find('div', class_='pictag')
|
||||||
|
flag = True
|
||||||
|
|
||||||
|
if link_tag and img_tag:
|
||||||
|
href = host_url + link_tag['href']
|
||||||
|
person = img_tag.text.strip()
|
||||||
|
|
||||||
|
# 将数据存储到 ethnic_map
|
||||||
|
list_data.append({
|
||||||
|
'ethnic': ethnic,
|
||||||
|
'person': person,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {len(list_data)} persons from this page.")
|
||||||
|
|
||||||
|
next_page = soup.find('a', rel='next')
|
||||||
|
if next_page:
|
||||||
|
next_url = host_url + next_page['href']
|
||||||
|
logging.debug(f"Found next page: {next_url}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
logging.debug(f"All pages fetched for {ethnic}.")
|
||||||
|
return list_data, None
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 解析列表页
|
||||||
|
def parse_page_dist_stu_list(soup, select_name):
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
select_element = soup.find('select', {'name': select_name})
|
||||||
|
if select_element :
|
||||||
|
options = select_element.find_all('option')
|
||||||
|
for option in options:
|
||||||
|
value = option.get('value') # 获取 value 属性
|
||||||
|
text = option.text.strip() # 获取文本内容
|
||||||
|
list_data.append({
|
||||||
|
'name' : text,
|
||||||
|
'href' : str(value)
|
||||||
|
})
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_dist_stu(soup, table_id):
|
||||||
|
table = soup.find("table", id=table_id)
|
||||||
|
if not table:
|
||||||
|
logging.warning(f"Warning: No {table_id} table found ")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 找到thead并跳过
|
||||||
|
thead = table.find('thead')
|
||||||
|
if thead:
|
||||||
|
thead.decompose() # 去掉thead部分,不需要解析
|
||||||
|
|
||||||
|
# 现在只剩下tbody部分
|
||||||
|
tbody = table.find('tbody')
|
||||||
|
rows = tbody.find_all('tr') if tbody else []
|
||||||
|
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
for row in rows:
|
||||||
|
cols = row.find_all('td')
|
||||||
|
if len(cols) >= 5:
|
||||||
|
title = cols[0].text.strip()
|
||||||
|
label = cols[1].text.strip()
|
||||||
|
year = cols[2].text.strip()
|
||||||
|
rev = cols[3].text.strip()
|
||||||
|
a_href = cols[0].find('a')
|
||||||
|
href = host_url + a_href['href'] if a_href else ''
|
||||||
|
|
||||||
|
list_data.append({
|
||||||
|
'title': title,
|
||||||
|
'label': label,
|
||||||
|
'year': year,
|
||||||
|
'rev': rev,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
return list_data, next_url
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 作品列表,有个人出演,也有导演的
|
||||||
|
def parse_credits_table(table, distributor_list):
|
||||||
|
# 找到thead并跳过
|
||||||
|
thead = table.find('thead')
|
||||||
|
if thead:
|
||||||
|
thead.decompose() # 去掉thead部分,不需要解析
|
||||||
|
|
||||||
|
# 现在只剩下tbody部分
|
||||||
|
tbody = table.find('tbody')
|
||||||
|
rows = tbody.find_all('tr') if tbody else []
|
||||||
|
|
||||||
|
movies = []
|
||||||
|
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||||
|
|
||||||
|
# rows = table.find_all('tr', class_='we')
|
||||||
|
for row in rows:
|
||||||
|
#tr_class = row.get('class', '') # 获取 class 属性,如果没有则返回空字符串
|
||||||
|
tr_class = ' '.join(row.get('class', [])) # 获取 class 属性,如果没有则返回空字符串
|
||||||
|
cols = row.find_all('td')
|
||||||
|
if len(cols) >= 6:
|
||||||
|
title = cols[0].text.strip()
|
||||||
|
href_a = cols[0].find('a')
|
||||||
|
href = href_a['href'] if href_a else ''
|
||||||
|
year = cols[1].text.strip()
|
||||||
|
distributor = cols[2].text.strip().lower()
|
||||||
|
href_d = cols[2].find('a')
|
||||||
|
href_dist = host_url + href_d['href'] if href_d else ''
|
||||||
|
notes = cols[3].text.strip()
|
||||||
|
rev = cols[4].text.strip()
|
||||||
|
formats = cols[5].text.strip()
|
||||||
|
|
||||||
|
for key in distributor_list:
|
||||||
|
if key in distributor:
|
||||||
|
distributor_count[key] += 1
|
||||||
|
|
||||||
|
movies.append({
|
||||||
|
'title': title,
|
||||||
|
'href' : href,
|
||||||
|
'year': year,
|
||||||
|
'distributor': distributor,
|
||||||
|
'distributor_href': href_dist,
|
||||||
|
'notes': notes,
|
||||||
|
'rev': rev,
|
||||||
|
'formats': formats,
|
||||||
|
'tr_class': tr_class
|
||||||
|
})
|
||||||
|
return movies, distributor_count
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the performer page and extract the data we need
def parse_page_performer(soup, url):
    data = {}

    # Field names we want, mapped to the labels used in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credit tables: one for personal appearances, one for directorial work
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Per-distributor counters, accumulated across both roles
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Merge this role's distributor statistics into the totals
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Total number of movies across all roles
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list)

    # Nothing was found at all
    if len(credits_list) == 0:
        logging.warning(f"movie table empty. url: {url}")

    # Walk every bioheading to collect the metadata fields
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling: the aliases sit in a div
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata_p = bio.find_next('p', class_='biodata')
            biodata = biodata_p.text.strip() if biodata_p else ''

        # Store the value under our own field name
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Attach the statistics to the result
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list

    return data

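# --- Hedged alternative sketch: merging per-role distributor stats with collections.Counter ---
# Equivalent in effect to the dict-based accumulation inside parse_page_performer above;
# shown only as an illustrative variant, not as the implementation the spider uses.
from collections import Counter

def _merge_distributor_counts(stat_maps, distributor_list):
    total = Counter({key: 0 for key in distributor_list})
    for stat_map in stat_maps:
        total.update(stat_map)  # Counter adds values key by key
    return dict(total)
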
# Parse the movie page HTML and extract the movie details
def parse_page_movie(soup, href, title):
    # Basic movie info
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            if key == "Directors":  # several directors listed on one line
                directors = []
                links = value.find_all("a")
                for link in links:
                    director_name = link.text.strip()
                    director_href = host_url + link.get('href') if link.get('href') else ''
                    directors.append({"name": director_name, "href": director_href})
                movie_data[key] = directors
            else:
                val = value.text.strip()
                if key in ["Distributor", "Studio", "Director"]:
                    link = value.find("a")
                    if link:
                        val = link.text.strip()
                        movie_data[f'{key}Href'] = host_url + link['href']
                movie_data[key] = val
    else:
        return None

    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        # Text nodes following each <br> are the performer's tags
        tags = []
        for br in cast.find_all("br"):
            tag = br.next_sibling
            if isinstance(tag, str) and tag.strip():
                tags.append(tag.strip())
        performer["tags"] = tags

        performers.append(performer)

    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()   # scene number
                performer_info = cols[1]       # performers plus their links

                # Keep the HTML up to the first <br> (preserving <i> tags and the like).
                # str.split() always returns at least one element, so check for the tag
                # explicitly instead of testing the split result.
                performer_html = str(performer_info)
                if "<br/>" in performer_html:
                    performers_html = performer_html.split("<br/>")[0].strip()
                elif "<br>" in performer_html:
                    performers_html = performer_html.split("<br>")[0].strip()
                else:
                    performers_html = performer_html.strip()

                # Reduce to plain text (strip the HTML tags, keep only the text content)
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performer names
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to pick up the webscene and studio links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip()
                    webscene = links[0]["href"]
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    # "Appears in" entries
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Directors": movie_data.get("Directors", []),  # only present when several directors are listed
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }

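# --- Hedged self-check for the <br> splitting above (illustrative only; never called) ---
# The cell markup is made up; it just confirms that everything before the first <br>
# is kept and reduced to a comma-separated list of names.
def _selfcheck_scene_split():
    cell = BeautifulSoup("<td>Anna, <i>Bella</i><br/><a href='/x'>Scene 1</a></td>", "html.parser").td
    html = str(cell)
    head = html.split("<br/>")[0] if "<br/>" in html else html
    names = [p.strip() for p in BeautifulSoup(head, "html.parser").get_text().split(",")]
    assert names == ["Anna", "Bella"]
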
if __name__ == "__main__":

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0])
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

        time.sleep(2)  # throttle the request rate

@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
    )
    return urlunparse(new_parsed)


def pretty_json_simple(item):
    try:
        # Serialize to single-line JSON; the caller must pass a mapping, not a list.
        # `json` is assumed to be imported at the top of this module.
        return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
    except Exception:
        # Fall back to returning the original object if conversion fails
        return item
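
# --- Hedged usage example for pretty_json_simple (illustrative only; not part of the module) ---
def _example_pretty_json_simple():
    assert pretty_json_simple({'name': 'Ann', 'cnt': 2}) == '{"name":"Ann","cnt":2}'
    # A list cannot be turned into a dict, so the input is returned unchanged
    assert pretty_json_simple([1, 2, 3]) == [1, 2, 3]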