modify scripts

2025-07-24 19:13:56 +08:00
parent cc6530d73a
commit 50d829364b
10 changed files with 1289 additions and 128 deletions

View File

@ -133,8 +133,9 @@ fi
# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
register_spider "pbox" "scrapy crawl javhd -a mod='update' "
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
register_spider "javhd" "scrapy crawl javhd -a mod='update' "
register_spider "lord" "scrapy crawl lord -a mod='update' "
fi

View File

@ -6,6 +6,7 @@ from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
from scrapy_proj.utils.utils import pretty_json_simple
# Registry of spider DB handlers
spider_handler_registry = {}
@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return 0
@register_handler(comm.SPIDER_NAME_LORD)
class LordDBHandler(SQLiteDBHandler):
def __init__(self, db_path=shared_db_path):
super().__init__(db_path)
self.tbl_name_actors = 'thelordofporn_actress'
self.tbl_name_alias = 'thelordofporn_alias'
def insert_item(self, item):
if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
self.insert_actor(item)
else:
logging.error(f"unkown item.")
return item
def insert_actor(self, item):
actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
if actor_id:
for alias in item.get('alias', []):
alias_data = {'actress_id':actor_id, 'alias':alias}
affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id','alias'], exists_do_nothing=False)
if affected_rows:
logging.debug(f"insert/update actress_alias. data: {alias_data}")
else:
logging.warning(f"insert actor alias error!. data: {alias_data}")
else:
logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")
# Statistics helper
def get_stat(self):
try:
self.cursor.execute(f"""
SELECT
(SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
""")
row = self.cursor.fetchone()
if not row:
logging.warning(f"query no results.")
return {}
columns = [desc[0] for desc in self.cursor.description]
return dict(zip(columns, row))
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return {}
def has_full_data(self, href):
try:
self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
row = self.cursor.fetchone()
return row[0] if row else None
except sqlite3.Error as e:
logging.error(f"query error: {e}")
return 0
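A minimal usage sketch of the new handler (values are illustrative only; it assumes the shared SQLite file and the thelordofporn_* tables already exist):
db = LordDBHandler()
db.insert_item({
'item_type': comm.ITEM_TYPE_ACTOR_DETAIL,
'href': 'https://thelordofporn.com/pornstars/example',  # hypothetical URL
'pornstar': 'Example Name',
'alias': ['Alias One', 'Alias Two'],
})
print(db.get_stat())  # e.g. {'actor_cnt': 1}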

View File

@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # applies the singleton metaclass
logging.error(f"Error inserting or updating data: {e}")
return None
def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
"""
针对联合主键表执行插入或更新操作
:param table_name: 表名
:param data: 字典类型,待插入或更新的数据
:param composite_pk: 列表类型,联合主键字段名集合
:param need_update: 布尔值记录存在时是否更新默认True
:return: 操作影响的行数
"""
try:
# Validate the composite key argument
if not isinstance(composite_pk, list) or len(composite_pk) < 2:
logging.error(f"composite_pk must be a list containing at least two fields: {composite_pk}")
return None
processed_data = self.check_and_process_data(data, tbl_name)
# Make sure every composite key field is present in the data
for pk_field in composite_pk:
if pk_field not in processed_data:
logging.error(f"composite key field '{pk_field}' missing from data")
return None
# Build the WHERE clause
where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
pk_values = [processed_data[pk] for pk in composite_pk]
# Check whether the record already exists
self.cursor.execute(
f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
pk_values
)
exists = self.cursor.fetchone() is not None
if exists:
if exists_do_nothing:
return 0
# Build the update field list (excluding the composite key fields)
update_fields = [f for f in processed_data.keys() if f not in composite_pk]
if not update_fields:
return 0
set_clause = ", ".join([f"{field} = ?" for field in update_fields])
update_values = [processed_data[field] for field in update_fields] + pk_values
# Run the update (standard syntax, compatible with older SQLite versions)
update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
self.cursor.execute(update_sql, update_values)
return 1
else:
# Run the insert
columns = ", ".join(processed_data.keys())
placeholders = ", ".join(["?" for _ in processed_data.keys()])
insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
self.cursor.execute(insert_sql, list(processed_data.values()))
return 2
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
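A quick sketch of how the composite-key upsert is intended to be called, using a hypothetical handler instance and table; per the code above it returns 0 when the record already exists and is left untouched, 1 after an update, 2 after an insert, and None on error:
rows = handler.insert_or_update_with_composite_pk(
data={'actress_id': 42, 'alias': 'Example Alias'},
tbl_name='thelordofporn_alias',
composite_pk=['actress_id', 'alias'],
exists_do_nothing=False,
)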

View File

@ -192,4 +192,33 @@ class JavHDActorItem(scrapy.Item):
birth_date = scrapy.Field()
ethnicity = scrapy.Field()
birth_place = scrapy.Field()
is_full_data = scrapy.Field()
is_full_data = scrapy.Field()
class LordActorItem(scrapy.Item):
item_type = scrapy.Field()
pornstar = scrapy.Field()
rating = scrapy.Field()
rank = scrapy.Field()
votes = scrapy.Field()
href = scrapy.Field()
career_start = scrapy.Field()
measurements = scrapy.Field()
born = scrapy.Field()
height = scrapy.Field()
weight = scrapy.Field()
date_modified = scrapy.Field()
global_rank = scrapy.Field()
weekly_rank = scrapy.Field()
last_month_rating = scrapy.Field()
current_rating = scrapy.Field()
total_votes = scrapy.Field()
birth_date = scrapy.Field()
birth_year = scrapy.Field()
birth_place = scrapy.Field()
height_ft = scrapy.Field()
height_cm = scrapy.Field()
weight_lbs = scrapy.Field()
weight_kg = scrapy.Field()
is_full_data = scrapy.Field()
alias = scrapy.Field()

View File

@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
yield request
def parse(self, response):
"""统一的响应处理入口"""
"""统一的响应处理入口,实际上没有起作用,因为直接走了 scrapy.Request 里的 callback """
# 记录请求耗时
request_time = response.meta.get('request_time')
if request_time:

View File

@ -1,15 +1,19 @@
import scrapy
import re
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
from scrapy_proj.spiders.parser.iafd_parser import common_parser
from scrapy_proj.utils.utils import pretty_json_simple
db_tools = IAFDDBHandler()
class IAFDSpider(BaseSpider):
name = SPIDER_NAME_IAFD
allowed_domains = ["iafd.com"]
allowed_domains = ["iafd.com", "www.iafd.com"]
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
studios_list_url = f"{host_url}/studio.asp"
ethnic_list_url = f'{host_url}/advsearch.asp'
def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.update = int(update)
self.update_mode = True if mod and mod.lower() == 'update' else False
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
self.cmd_astro = 'astro'
@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
query_args = {}
if self.debug:
query_args['limit'] = 5
if self.update == 0:
if self.update_mode:
query_args['is_full_data'] = 0
query_args['is_full_data'] = 404
# Load the list of performers awaiting update
if self.cmd_performers in self.cmd_list:
@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type':'actor'})
# Load the list of movies awaiting update
if self.cmd_movies in self.cmd_list:
@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
for item in movies:
href = item.get('href', '')
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type':'movie'})
def start_astro(self):
@ -113,50 +118,28 @@ class IAFDSpider(BaseSpider):
yield request
def parse_astro_page(self, response):
astro = response.meta['astro']
astro_div = response.css('div#astro')
if astro_div:
birth_date = None
for elem in astro_div.css('*'):
if elem.css('h3.astroday'):
birth_date = elem.css('h3.astroday::text').get().strip()
elif elem.css('div.perficon'):
a_tag = elem.css('a')
if a_tag:
href = self.host_url + a_tag.attrib['href']
name = a_tag.css('span.perfname::text').get()
if name:
item = IAFDPersonItem()
item['name'] = name
item['href'] = href
item['from_astro_list'] = 1
item['from_birth_list'] = 0
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
astro = response.meta.get('astro', '')
data, next_url = common_parser(html=response.text, page='astro', astro=astro)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"parse data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_birth_page(self, response):
month = response.meta['month']
day = response.meta['day']
datarows = response.css('div.col-sm-12.col-lg-9')
if datarows:
rows = datarows[0].css('div.col-sm-4')
for row in rows:
link_tag = row.css('a')
person = link_tag.css('::text').get().strip() if link_tag else ''
href = self.host_url + link_tag.attrib['href'] if link_tag else ''
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 1
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"parse data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_ethnic_list_page(self, response):
div_root = response.css('select#ethnicity1')
if div_root:
@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
href = option.attrib.get('value')
text = option.css('::text').get().strip()
if href and href.lower() != 'none':
ethnic_url = self.host_url + href
ethnic_url = urljoin(response.url , href)
self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
if self.debug:
break
def parse_ethnic_page(self, response):
ethnic = response.meta['ethnic']
rows = response.css('div.row.headshotrow')
for row in rows:
cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
for col in cols:
link_tag = col.css('a')
img_tag = col.css('div.pictag')
if link_tag and img_tag:
href = self.host_url + link_tag.attrib['href']
person = img_tag.css('::text').get().strip()
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 0
item['from_ethnic_list'] = 1
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
next_page = response.css('a[rel="next"]')
if next_page:
next_url = self.host_url + next_page.attrib['href']
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
self.logger.warning(f"parse data error. {response.url}")
if next_url:
self.logger.info(f"find next page: {next_url}")
else:
self.logger.info(f"found all pages. url: {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
dis_url = self.host_url + f"/distrib.rme/distrib={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = dis_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 1
item['from_stu_list'] = 0
yield item
#yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
stu_url = self.host_url + f"/studio.rme/studio={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = stu_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 0
item['from_stu_list'] = 1
yield item
#yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
dis_url = f"{self.host_url}/studio.rme/studio={value}"
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
def parse_stu_dist_page(self, response):
list_type = response.meta.get('list_type', '')
data, next_url = common_parser(html=response.text, page=list_type)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDPersonDetailItem()
#yield item
def parse_person_detail_page(self, response):
data = common_parser(html=response.text, page='actor', url=response.url)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDPersonDetailItem()
item['href'] = response.url
item['person'] = response.css('h1::text').get() # assumes the name is in the h1 tag
# Parse the remaining details; adjust to the actual page structure
item['gender'] = response.css('span.gender::text').get()
item['birthday'] = response.css('span.birthday::text').get()
item['astrology'] = response.css('span.astrology::text').get()
item['birthplace'] = response.css('span.birthplace::text').get()
item['years_active'] = response.css('span.years_active::text').get()
item['ethnicity'] = response.css('span.ethnicity::text').get()
item['nationality'] = response.css('span.nationality::text').get()
item['hair_colors'] = response.css('span.hair_colors::text').get()
item['eye_color'] = response.css('span.eye_color::text').get()
item['height'] = response.css('span.height::text').get()
item['weight'] = response.css('span.weight::text').get()
item['measurements'] = response.css('span.measurements::text').get()
item['tattoos'] = response.css('span.tattoos::text').get()
item['piercings'] = response.css('span.piercings::text').get()
item['movies_cnt'] = response.css('span.movies_cnt::text').get()
item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
item['performer_aka'] = response.css('span.performer_aka::text').getall()
yield item
#yield item
def parse_movie_detail_page(self, response):
title = response.meta.get('title', '')
data = common_parser(html=response.text, page='movies', href=response.url, title=title)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
else:
self.logger.warning(f"fetched data error. {response.url}")
item = IAFDMovieDetailItem()
item['title'] = response.css('h1::text').get() # assumes the title is in the h1 tag
item['href'] = response.url
# Parse the remaining details; adjust to the actual page structure
yield item
#yield item
def custom_block_check(self, response):
item_type = response.meta.get('item_type', '')
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
return "invalid or outdated page"
else:
self.logger.info(f"right content. url: {response.url}")
return None
# Handle page errors, mainly 404 and 403
def handle_blocked(self, response, reason):
item_type = response.meta.get('item_type', '')
if response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")

View File

@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
item['rank'] = rank
item['url'] = url
item[f'{lang}_name'] = name
#TODO: for non-English pages, update the corresponding localized name
# For non-English pages, update the corresponding localized name
if lang != 'en':
item['url'] = replace_lang_param(item['url'])
yield item
@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
meta={"list_item": item} # 传递列表页数据到详情页
)
else:
self.logger.info(f"actor(name) has full data. skip. url: {url}")
self.logger.info(f"actor({name}) has full data. skip. url: {url}")
# Fetch the next page
next_path = data.get("pagination_params", {}).get("next")

View File

@ -0,0 +1,399 @@
import scrapy
import sys
import re
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import LordActorItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler
db_tools = LordDBHandler()
class LordSpider(BaseSpider):
name = SPIDER_NAME_LORD
allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]
# Request headers copied from the original curl command
custom_settings = {
"DEFAULT_REQUEST_HEADERS": {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
"priority": "u=0, i",
"sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"macOS\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
},
"COOKIES_ENABLED": True # 启用Cookie支持
}
def __init__(self, debug='false', mod='update', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.update_mod = False if mod and mod.lower() == 'force' else True
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
# Entry point, triggered by the base class
def custom_start_requests(self):
url = 'https://thelordofporn.com/pornstars/'
yield scrapy.Request(
url=url,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # use the GET headers
callback=self.parse_list,
meta={} # no extra meta needed for the list page
)
def parse_list(self, response):
# Extract all actor entries (article.loop-item in the original script)
articles = response.css("article.loop-item")
self.logger.info(f"当前页({response.url})找到 {len(articles)} 个演员条目")
for article in articles:
try:
# Extract the actor name and detail-page link
title_tag = article.css("h3.loop-item__title a")
title = title_tag.css("::text").get(default="N/A").strip()
href = title_tag.attrib.get("href") # href attribute of the a tag
# Extract the rating
rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()
# Extract rank and vote count (meta_tags in the original script)
meta_tags = article.css("div.loop-item__rank span")
rank = None
votes = None
# Parse the rank (b tag inside the first span)
if len(meta_tags) >= 1:
rank_b = meta_tags[0].css("b::text").get()
rank = rank_b.strip() if rank_b else "N/A"
# Parse the vote count (b tag inside the second span)
if len(meta_tags) >= 2:
votes_b = meta_tags[1].css("b::text").get()
votes = votes_b.strip() if votes_b else "N/A"
# Convert to numeric values (mirrors utils.parse_numeric in the original script)
def parse_numeric(value):
if not value or value == "N/A":
return None
# Strip non-digit characters (commas, %, etc.)
numeric_str = ''.join(filter(str.isdigit, value))
return int(numeric_str) if numeric_str else None
# Build the actor data dict
actress_data = {
"pornstar": title,
"rating": parse_numeric(rating),
"rank": parse_numeric(rank),
"votes": parse_numeric(votes),
"href": href if href else None
}
# Issue the detail-page request
actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
if actor_exists < 1 :
yield scrapy.Request(
url=href,
callback=self.parse_actor_detail,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
meta = {'actor':actress_data}
)
else:
self.logger.info(f"actor({title}) has full data. skip. url: {href}")
except Exception as e:
self.logger.error(f"解析演员条目失败: {e}, 页面: {response.url}")
continue # 跳过错误条目,继续解析下一个
# Extract the next-page link (.next.page-numbers in the original script)
next_page_url = None
next_page_tag = response.css(".nav-links .next.page-numbers")
if next_page_tag:
next_page_href = next_page_tag.attrib.get("href")
if next_page_href and not self.debug:
# Build the absolute URL (handles relative paths)
next_page_url = urljoin(response.url, next_page_href)
yield scrapy.Request(
url=next_page_url,
callback=self.parse_list,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
meta = {}
)
else:
self.logger.info(f"已解析所有页面, current url: {response.url}")
def parse_actor_detail(self, response):
# 1. Field mapping: raw page field -> Item field
FIELD_MAPPING = {
# Basic info
'date_modified': 'date_modified',
# Rank info
'Global Rank': 'global_rank',
'Weekly Rank': 'weekly_rank',
# Rating info
'Last Month': 'last_month_rating',
'Rating Av.': 'current_rating',
'Total of Votes': 'total_votes',
# Detailed attributes
'Career start': 'career_start',
'Measurements': 'measurements',
'Born': 'born',
'Height': 'height',
'Weight': 'weight',
'Name': 'alias_raw', # aliases come from the Name field
# Parsed fields (birth / height / weight)
'birth_date': 'birth_date',
'birth_year': 'birth_year',
'birth_place': 'birth_place',
'height_ft': 'height_ft',
'height_cm': 'height_cm',
'weight_lbs': 'weight_lbs',
'weight_kg': 'weight_kg',
'alias':'alias'
}
# 2. Initialize the raw-data container
raw_data = {}
# 3. Extract the basic info
raw_data['href'] = response.url
entry_header = response.css("header.entry-header")
raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()
# 4. Extract the rank info
for item in entry_header.css("div.porn-star-rank__item"):
item_text = item.css("::text").get(default="").strip()
raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
# 5. Extract the rating and vote info
for item in response.css("div.specifications__item--horizontal"):
# Locate the title area precisely (excluding the b tag)
# Case 1: structure with a child div (e.g. Rating Av. with an img)
title_div = item.css("div:first-child")
if title_div:
# Take only the text inside the child div, which excludes the sibling b tag
title_parts = title_div.css("::text").getall()
else:
# Cases 2 and 3: no child div (Last Month and Total of Votes)
# Take all text inside the current item but drop the b-tag content
all_text_parts = item.css("::text").getall()
b_text_parts = item.css("b::text").getall()
# Remove the b-tag text from the collected text
title_parts = [t for t in all_text_parts if t not in b_text_parts]
# Clean the title text (non-breaking spaces and whitespace)
title_text = "".join(title_parts)
title_text = title_text.replace(u'\xa0', u' ') # replace non-breaking spaces
title_text = re.sub(r'\s+', ' ', title_text).strip() # collapse whitespace
raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
# 6. Extract the detailed attributes (specifications-grid-row)
for row in response.css("div.specifications-grid-row"):
items = row.css("div.specifications-grid-item")
for i in [0, 1]: # two attributes per row
if i < len(items):
label = extract_text_from_element(items[i].css("h5"))
value = extract_text_from_element(items[i].css("span"))
if label:
raw_data[label] = value
# 7. Handle special fields (aliases need cleaning)
raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))
# 8. Parse birth info, height and weight, then merge
raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
raw_data.update(self.parse_height(raw_data.get("Height", "")))
raw_data.update(self.parse_weight(raw_data.get("Weight", "")))
# 9. Map into the Item and return
item = LordActorItem()
item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
actor_data = response.meta['actor']
for k, v in actor_data.items():
if k in item.fields:
item[k] = v
for raw_field, item_field in FIELD_MAPPING.items():
if item_field in item.fields:
item[item_field] = raw_data.get(raw_field, "")
# Mark the record as complete
item['is_full_data'] = 1
self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")
yield item
# Helper functions kept from the original script (now Spider methods)
def parse_birth_info(self, text):
match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
if match:
return {
"birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
"birth_year": match.group(3),
"birth_place": match.group(4),
}
return {"birth_date": text, "birth_year": "", "birth_place": ""}
def parse_height2(self, text):
match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
if match:
height_ft = f"{match.group(1)}'{match.group(2)}\""
return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
return {"height_ft": text, "height_cm": ""}
def parse_height(self, text):
# Preprocessing: replace commas with dots and fix common typos (e.g. 'n' -> 'in')
text = text.replace(',', '.').replace(' n ', ' in ').strip()
# Regex matching all feet+inch formats (several notations supported)
# Group reference:
# 1. feet value  2. feet unit (feet/ft/ft./')  3. inch value  4. inch unit (inches/in/in./inch/")
# 5. cm/m value  6. unit (cm/m)
pattern = r"""
# 情况1先英尺英寸后厘米/米(主流格式)
(?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分如5ft/5'
(?:and\s*)? # 可选的"and"如5 feet and 2 inches
(\d+)\s*(inches|in\.?|inch|")?\s* # 英寸部分如2in/2"
(?:\(?(\d+\.?\d*)\s*(cm|m)\)?) # 厘米/米部分(如(157cm)/(1.57m)
| # 或
# 情况2先厘米后英尺英寸如170 cm / 5 feet and 7 inches
(\d+)\s*cm\s*/\s* # 厘米在前
(?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分
(?:and\s*)?
(\d+)\s*(inches|in\.?|inch|")? # 英寸部分
| # 或
# 情况3纯简写格式如5'3" (160 cm)
(\d+)'(\d+)"\s*\(?(\d+)\s*cm\)? # 5'3"格式
"""
# VERBOSE ignores whitespace inside the pattern; IGNORECASE makes matching case-insensitive
match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
if not match:
# Handle a plain centimetre value (e.g. "160cm")
cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
if cm_match:
return {"height_ft": "", "height_cm": cm_match.group(1)}
return {"height_ft": text, "height_cm": ""}
# Extract the match results (group handling differs per case)
ft = None
inch = None
cm = None
# Case 1: feet/inches first, then cm/m
if match.group(1) and match.group(3):
ft = match.group(1)
inch = match.group(3)
num = match.group(5)
unit = match.group(6).lower() if match.group(6) else 'cm'
# Case 2: cm first, then feet/inches
elif match.group(7):
cm = match.group(7)
ft = match.group(8)
inch = match.group(10)
unit = 'cm' # in case 2 the leading unit is always cm
# Case 3: shorthand format (5'3")
elif match.group(12) and match.group(13): # groups 12-14 belong to the third regex alternative
ft = match.group(12)
inch = match.group(13)
cm = match.group(14)
unit = 'cm'
# Convert metres to centimetres when needed
if not cm and num and unit:
if unit == 'm':
cm = str(int(float(num) * 100)) # 1.57m -> 157cm
else:
cm = num # already in centimetres
# Format the feet/inch expression, e.g. 5'2"
height_ft = f"{ft}'{inch}\"" if ft and inch else ""
return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}
def parse_weight2(self, text):
match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
if match:
return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
return {"weight_lbs": text, "weight_kg": ""}
def parse_weight(self, text):
# Preprocessing: trim and normalise spaces
text = text.strip().replace(' ', ' ')
# Regex matching several weight formats
# Group reference:
# 1. pound value  2. pound unit (lb/lbs/pounds)  3. kilogram value  4. kilogram unit (kg)
# 5. leading kilogram value  6. kilogram unit  7. trailing pound value  8. pound unit
pattern = r"""
# 情况1磅在前千克在后主流格式
(?:(\d+)\s*(lb|lbs|pounds)?\s*) # 磅部分支持lb/lbs/pounds或省略单位
(?:\(?\s*(\d+)\s*(kg)\s*\)?) # 千克部分(如(45 kg)
| # 或
# 情况2千克在前磅在后如52 kg / 114 lbs
(?:(\d+)\s*(kg)\s*/\s*) # 千克部分
(\d+)\s*(lb|lbs|pounds)? # 磅部分
"""
# VERBOSE and IGNORECASE improve tolerance of formatting differences
match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
if not match:
# Try a plain kilogram value (e.g. "52kg")
kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
if kg_match:
return {"weight_lbs": "", "weight_kg": kg_match.group(1)}
# Try a plain pound value (e.g. "114lb")
lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
if lb_match:
return {"weight_lbs": lb_match.group(1), "weight_kg": ""}
# Nothing could be parsed
return {"weight_lbs": text, "weight_kg": ""}
# Extract the match results (group handling differs per case)
weight_lbs = None
weight_kg = None
# Case 1: pounds first, kilograms after
if match.group(1) and match.group(3):
weight_lbs = match.group(1)
weight_kg = match.group(3)
# Case 2: kilograms first, pounds after
elif match.group(5) and match.group(6):
weight_kg = match.group(5)
weight_lbs = match.group(7)
return {
"weight_lbs": weight_lbs.strip() if weight_lbs else "",
"weight_kg": weight_kg.strip() if weight_kg else ""
}
def clean_alias(self, alias):
alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
return [name.strip() for name in alias.split(',') if name.strip()]
def parse_numeric(self, value):
try:
return float(value)
except (ValueError, TypeError):
return 0
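A few illustrative calls against the helpers above, assuming `spider` is a LordSpider instance; the sample strings are made up, not taken from the site:
spider.parse_height("5 ft 2 in (157 cm)")            # height_ft: 5'2", height_cm: 157
spider.parse_height("170 cm / 5 feet and 7 inches")  # height_ft: 5'7", height_cm: 170
spider.parse_weight("114 lbs (52 kg)")                # weight_lbs: 114, weight_kg: 52
spider.clean_alias("Jane Doe (Age 25), J. Doe")       # ['Jane Doe', 'J. Doe']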

View File

@ -0,0 +1,636 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils
# Base URLs and variable parameters
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
ethnic_list_url = f'{host_url}/advsearch.asp'
# Set up headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99
save_raw_html = True
load_from_local = False
def common_parser(html, page, **kwargs):
parser = "lxml" if page=='ethnic' else "html.parser"
soup = BeautifulSoup(html, parser)
if not soup:
return None
if page == 'astro':
#parse_page_astro(soup, astro):
return parse_page_astro(soup, **kwargs)
elif page == 'birth':
#parse_page_birth(soup, month, day):
return parse_page_birth(soup, **kwargs)
elif page == 'ethnic':
#parse_page_ethnic(soup, ethnic):
return parse_page_ethnic(soup, **kwargs)
elif page == 'dist':
return parse_page_dist_stu(soup,'distable')
elif page == 'stu':
return parse_page_dist_stu(soup,'studio')
elif page == 'actor':
#parse_page_performer(soup, url):
return parse_page_performer(soup, **kwargs)
elif page == 'movies':
#parse_page_movie(soup, href, title)
return parse_page_movie(soup, **kwargs)
else:
logging.warning(f"wrong page: {page}")
return None
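A rough usage sketch of the dispatcher (the HTML string and ethnicity value are placeholders): the list-style pages ('astro', 'birth', 'ethnic', 'dist', 'stu') return a (data, next_url) pair, while 'actor' and 'movies' return a single dict.
html = "<html>...</html>"  # hypothetical response body
data, next_url = common_parser(html=html, page='ethnic', ethnic='Asian')
if data:
    for person in data:
        print(person['person'], person['href'])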
'''
# Fetch a page with CloudScraper, validate it, and support different parsers and optional preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
if load_from_local: # read from the local cache instead of the network
html = utils.read_raw_html(url)
if html:
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(html) if preprocessor else html
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
return soup, http_code_local # a code below 100 signals the page came from the local cache
for attempt in range(max_retries):
try:
if host_url not in url.lower():
logging.error(f'wrong url format: {url}')
return None, http_code_url
response = scraper.get(url, headers=headers)
# Handle HTTP status codes
if response.status_code == 404:
logging.debug(f"Page not found (404): {url}")
return None, http_code_404 # return 404 directly so the caller can skip this page
response.raise_for_status() # raise on HTTP errors
# Outdated pages are treated the same as a 404
if "invalid or outdated page" in response.text.lower():
logging.debug(f"invalid or outdated page: {url}")
return None, http_code_404 # return 404 directly so the caller can skip this page
if save_raw_html:
utils.write_raw_html(url, response.text)
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # run the custom page check
return soup, response.status_code
else:
# Check for redirects, e.g. to a login page
if response.history:
logging.warning(f"Page redirected on {url}. Validation failed.")
return None, http_code_login
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None, None # still failing after the maximum number of retries
'''
# Fix the HTML structure: strip stray tags and patch <a> tags; needed when fetching the ethnicity pages
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Check whether the movie table is present
def movie_validator(soup, table_id):
return soup.find("table", id=table_id) is not None
# Parse the HTML and extract the data we need (ethnicity list page)
def parse_page_ethnic_list(soup, href):
div_root = soup.find("select", id="ethnicity1")
if not div_root:
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
return None, None
list_data = []
# 提取所有的 <option> 标签
options = div_root.find_all('option')
if options:
# Parse each option's value and text
for option in options:
href = option.get('value', None)
text = option.text.strip()
if href and href.lower() == 'none':
continue
list_data.append({
"name": text,
"href": host_url + href if href else ''
})
return list_data
# Parse the HTML and extract the data we need (astrology page)
def parse_page_astro(soup, astro):
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
list_data.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the page content and build the birth list
def parse_page_birth(soup, month, day):
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
# Skip hrefs that are already in the list
flag = True
if any(entry['href'] == href for entry in list_data):
continue
# Add the entry to the list
list_data.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need (ethnicity page)
def parse_page_ethnic(soup, ethnic):
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_data = []
next_url = None
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
# Store the entry in the ethnicity list
list_data.append({
'ethnic': ethnic,
'person': person,
'href': href
})
if flag:
logging.debug(f"get {len(list_data)} persons from this page.")
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.debug(f"Found next page: {next_url}")
return list_data, next_url
else:
logging.debug(f"All pages fetched for {ethnic}.")
return list_data, None
else:
return None, None
# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
list_data = []
next_url = None
select_element = soup.find('select', {'name': select_name})
if select_element :
options = select_element.find_all('option')
for option in options:
value = option.get('value') # value attribute
text = option.text.strip() # text content
list_data.append({
'name' : text,
'href' : str(value)
})
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need (distributor/studio table)
def parse_page_dist_stu(soup, table_id):
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found ")
return None, None
# Find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose() # drop the thead; it does not need parsing
# Only the tbody remains
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
list_data = []
next_url = None
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
list_data.append({
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return list_data, next_url
# Parse the credits table (both performer and director credits)
def parse_credits_table(table, distributor_list):
# Find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose() # drop the thead; it does not need parsing
# Only the tbody remains
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list} # initialise a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
#tr_class = row.get('class', '') # class attribute, or empty string if missing
tr_class = ' '.join(row.get('class', [])) # class attribute joined into a string, empty if missing
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
href_a = cols[0].find('a')
href = href_a['href'] if href_a else ''
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
href_d = cols[2].find('a')
href_dist = host_url + href_d['href'] if href_d else ''
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'href' : href,
'year': year,
'distributor': distributor,
'distributor_href': href_dist,
'notes': notes,
'rev': rev,
'formats': formats,
'tr_class': tr_class
})
return movies, distributor_count
# Extract the needed data from a performer page
def parse_page_performer(soup, url):
# Extracted data
data = {}
# Field names we need and the matching labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the credits tables (performer and director roles)
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# Track statistics in a dict
distributor_count = {key: 0 for key in distributor_list} # initialise a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# Update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# Nothing was found
if len(credits_list) == 0 :
logging.warning(f"movie table empty. url: {url} ")
# Walk every bioheading and collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
# 如果包含 "Performer",需要特殊处理
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
# Save the value
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
# Add the statistics to data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
data['credits'] = credits_list
return data
# Parse the movie page HTML and extract the movie info
def parse_page_movie(soup, href, title):
# Parse the basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
if key == "Directors": # 解析多位导演的情况
directors = []
links = value.find_all("a")
for link in links:
director_name = link.text.strip()
director_href = host_url + link['href'] if link['href'] else ''
directors.append({"name": director_name, "href": director_href})
movie_data[key] = directors
else:
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
# Parse the cast info
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
#performer["tags"] = [
# tag.strip() for br in cast.find_all("br")
# if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
#]
tags = []
for br in cast.find_all("br"):
tag = br.next_sibling
if isinstance(tag, str) and tag.strip():
tags.append(tag.strip())
performer["tags"] = tags
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Parse the scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip() # scene number
performer_info = cols[1] # performer and link info
# Get the full HTML before the <br> (keeps <i> tags and other formatting)
performer_html = str(performer_info) # full HTML content
split_html = performer_html.split("<br/>") # split on <br>
if split_html:
performers_html = split_html[0].strip() # take the part before the <br>
else:
split_html = performer_html.split("<br>") # split on <br>
if split_html:
performers_html = split_html[0].strip() # take the part before the <br>
else:
performers_html = performer_html.strip() # no <br>, keep everything
# Convert to plain text (strip HTML tags, keep only the text)
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
# Extract the performers
scene_performers = [p.strip() for p in performers_text.split(",")]
# Try to extract `webscene` and `studio`
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Directors": movie_data.get("Directors", []), # 可能存在的元素
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
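# NOTE: the block below is a legacy test entry point. It calls fetch_page, which is commented out above, so running this module directly will fail unless that helper is restored.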
if __name__ == "__main__":
for astro in astro_list:
url = astr_base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while True:
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = parse_page_astro(soup, astro)
if list_data:
print(list_data[0] if len(list_data)>0 else 'no data')
break
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5) # wait before retrying
time.sleep(2) # throttle the request rate

View File

@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
)
return urlunparse(new_parsed)
def pretty_json_simple(item):
try:
# Serialise to single-line JSON; the input must be a mapping, not a list
return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
except Exception:
# On failure, return the original value unchanged
return item
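A small illustrative sketch of the helper (values are made up): anything dict-like serialises to one compact line, and anything that cannot be turned into a dict is returned unchanged.
pretty_json_simple({'name': 'Example', 'rank': 1})  # -> '{"name":"Example","rank":1}'
pretty_json_simple(42)                              # not a mapping -> returned as-is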