modify scripts
scrapy_proj/scrapy_proj/db_wapper/iafd_query.py (new file, 104 lines)
@@ -0,0 +1,104 @@
import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path


class IAFDQuery(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):
        super().__init__(db_path)
        self.tbl_name_performers = 'iafd_performers'
        self.tbl_name_movies = 'iafd_movies'
        self.uniq_key = 'href'

    # Query the performer href list by the given filters
    def get_performers(self, **filters):
        try:
            sql = f"SELECT href, name, id, movies_cnt FROM {self.tbl_name_performers} WHERE 1=1"
            params = []

            conditions = {
                "id": " AND id = ?",
                "href": " AND href = ?",
                "name": " AND name LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "name":
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # Note: ORDER BY must be followed by a column name; a placeholder cannot be used here,
                # otherwise the value would be bound as a string literal
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [dict(row) for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"Failed to query performer hrefs: {e}")
            return None

    # Query the movie href list by the given filters
    def get_movies(self, **filters):
        try:
            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
            params = []

            conditions = {
                "id": " AND id = ?",
                "href": " AND href = ?",
                "title": " AND title LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "title":
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # Note: ORDER BY must be followed by a column name; a placeholder cannot be used here,
                # otherwise the value would be bound as a string literal
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [dict(row) for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"Failed to query movie hrefs: {e}")
            return None
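A brief, hypothetical usage sketch of the query helper above (not part of the commit); it assumes the iafd_performers table already exists with the columns selected in get_performers, and shows how the keyword filters map onto the generated WHERE clause:

# Hypothetical example: fetch up to 10 performers whose detail pages have not
# been crawled yet (is_full_data = 0), ordered by id.
from scrapy_proj.db_wapper.iafd_query import IAFDQuery

db = IAFDQuery()
rows = db.get_performers(is_full_data=0, order_by='id', limit=10)
for row in rows or []:  # rows is None if a sqlite3.Error occurred
    print(row['id'], row['name'], row['href'])
db.close()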
scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py (new file, 134 lines)
@@ -0,0 +1,134 @@
import os
import sqlite3
import logging
from datetime import datetime

home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"


# Database base class that wraps the common operations.
class SQLiteDBHandler:
    def __init__(self, db_path=None):
        # Use the given db_path, or fall back to the default path
        self.DB_PATH = db_path or default_dbpath

        # Make sure the parent directory exists (optional)
        if db_path and not os.path.exists(os.path.dirname(db_path)):
            os.makedirs(os.path.dirname(db_path))

        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.conn.execute('PRAGMA journal_mode = WAL')  # enable WAL (Write-Ahead Logging) mode
        self.conn.commit()

        self.conn.row_factory = sqlite3.Row  # result rows support dict-style access
        self.cursor = self.conn.cursor()

        # Check the SQLite version
        self.lower_sqlite_version = False
        sqlite_version = sqlite3.sqlite_version_info
        if sqlite_version < (3, 24, 0):
            self.lower_sqlite_version = True

    def get_table_columns_and_defaults(self, tbl_name):
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def check_and_process_data(self, data, tbl_name):
        column_info = self.get_table_columns_and_defaults(tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment key and creation date come from the table defaults
                continue
            if col == 'updated_at':  # update timestamp is set explicitly here
                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if col in data:
                processed_data[col] = data[col]

        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
        if self.lower_sqlite_version:
            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)

        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()

            # Fetch the id of the inserted or updated record
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])

            # Try a plain INSERT first
            try:
                sql = f'''
                    INSERT INTO {tbl_name} ({columns})
                    VALUES ({placeholders})
                '''
                self.cursor.execute(sql, values)
                self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an UPDATE
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
                update_values.append(data[uniq_key])
                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
                self.cursor.execute(sql, update_values)
                self.conn.commit()

            # Fetch the id of the inserted or updated record
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def get_id_by_key(self, tbl, uniq_key, val):
        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
        row = self.cursor.fetchone()
        return row[0] if row else None

    def close(self):
        self.cursor.close()
        self.conn.close()
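A hypothetical usage sketch of the upsert helper (not part of the commit); it assumes an iafd_performers table with a UNIQUE constraint on href plus id/created_at/updated_at columns, which is what check_and_process_data and the ON CONFLICT clause rely on:

# Hypothetical example of insert_or_update_common.
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path

db = SQLiteDBHandler(shared_db_path)
record_id = db.insert_or_update_common(
    {'href': 'https://www.iafd.com/person.rme/id=example', 'name': 'Example Name'},  # placeholder values
    tbl_name='iafd_performers',
    uniq_key='href',
)
print(record_id)  # id of the inserted row, or of the existing row that was updated
db.close()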
scrapy_proj/scrapy_proj/items.py
@@ -19,4 +19,49 @@ class U001Item(scrapy.Item):
 class Sis001Item(scrapy.Item):
     title = scrapy.Field()
     url = scrapy.Field()
     plate_name = scrapy.Field()
+
+class IAFDPersonItem(scrapy.Item):
+    name = scrapy.Field()
+    href = scrapy.Field()
+    from_astro_list = scrapy.Field()
+    from_birth_list = scrapy.Field()
+    from_ethnic_list = scrapy.Field()
+    from_movie_list = scrapy.Field()
+
+class IAFDMovieItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    release_year = scrapy.Field()
+    from_performer_list = scrapy.Field()
+    from_dist_list = scrapy.Field()
+    from_stu_list = scrapy.Field()
+
+class IAFDPersonDetailItem(scrapy.Item):
+    href = scrapy.Field()
+    person = scrapy.Field()
+    gender = scrapy.Field()
+    birthday = scrapy.Field()
+    astrology = scrapy.Field()
+    birthplace = scrapy.Field()
+    years_active = scrapy.Field()
+    ethnicity = scrapy.Field()
+    nationality = scrapy.Field()
+    hair_colors = scrapy.Field()
+    eye_color = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    measurements = scrapy.Field()
+    tattoos = scrapy.Field()
+    piercings = scrapy.Field()
+    movies_cnt = scrapy.Field()
+    vixen_cnt = scrapy.Field()
+    blacked_cnt = scrapy.Field()
+    tushy_cnt = scrapy.Field()
+    x_art_cnt = scrapy.Field()
+    performer_aka = scrapy.Field()
+
+class IAFDMovieDetailItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    # More movie detail fields can be added here as needed
scrapy_proj/scrapy_proj/middlewares.py
@@ -98,3 +98,68 @@ class ScrapyProjDownloaderMiddleware:
     def spider_opened(self, spider):
         spider.logger.info("Spider opened: %s" % spider.name)
 
+
+import cloudscraper
+from scrapy.http import TextResponse
+import datetime
+
+
+# Use cloudscraper as a proxy downloader to request the target site.
+class CloudScraperMiddleware:
+    def __init__(self, stats):
+        self.scraper = cloudscraper.create_scraper()
+        self.stats = stats  # injected stats collector
+        # Domains that should be fetched through cloudscraper
+        self.target_domains = {'iafd.com', 'another-domain.com'}
+
+        # Headers used by the scraper
+        self.ifad_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            stats=crawler.stats  # Scrapy stats collector
+        )
+
+    def process_request(self, request, spider):
+        # Record the request start time
+        start_time = datetime.datetime.now()
+
+        try:
+            # Send the request through cloudscraper
+            response = self.scraper.get(
+                request.url,
+                headers=self.ifad_headers,
+                cookies=request.cookies
+            )
+
+            # Request duration in milliseconds
+            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+
+            # Update crawl statistics
+            self.stats.inc_value('downloader/request_count')
+            self.stats.inc_value('downloader/request_method_count/GET')
+            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
+
+            self.stats.inc_value('downloader/response_count')
+            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
+            self.stats.inc_value('downloader/response_bytes', len(response.content))
+
+            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+
+            # Build a Scrapy response object from the cloudscraper result
+            return TextResponse(
+                url=response.url,
+                status=response.status_code,
+                body=response.content,
+                encoding=response.encoding,
+                request=request
+            )
+
+        except Exception as e:
+            # Record the error
+            self.stats.inc_value('downloader/exception_count')
+            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            spider.logger.error(f"CloudScraper request failed: {e}")
+            return None  # fall back to the default downloader on failure
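The middleware above delegates the actual download to the cloudscraper library; a minimal standalone sketch (not part of the commit, URL used only as a placeholder) of the underlying calls it relies on:

# Reduced to the bare cloudscraper calls used by CloudScraperMiddleware.
import cloudscraper

scraper = cloudscraper.create_scraper()      # requests-like session that solves Cloudflare JS challenges
resp = scraper.get('https://www.iafd.com/')  # returns a regular requests Response
print(resp.status_code, len(resp.content))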
scrapy_proj/scrapy_proj/pipelines.py
@@ -15,132 +15,8 @@ import os
 import sqlite3
 import logging
 from datetime import datetime
-from scrapy_proj.items import U001Item, Sis001Item
+from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
+from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
-home_dir = os.path.expanduser("~")
-global_share_data_dir = f'{home_dir}/sharedata'
-default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
-
-
-# Database base class that wraps the common operations.
-class SQLiteDBHandler:
-    def __init__(self, db_path=None):
-        # Use the given db_path, or fall back to the default path
-        self.DB_PATH = db_path or default_dbpath
-
-        # Make sure the parent directory exists (optional)
-        if db_path and not os.path.exists(os.path.dirname(db_path)):
-            os.makedirs(os.path.dirname(db_path))
-
-        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
-        self.cursor = self.conn.cursor()
-
-        # Check the SQLite version
-        self.lower_sqlite_version = False
-        sqlite_version = sqlite3.sqlite_version_info
-        if sqlite_version < (3, 24, 0):
-            self.lower_sqlite_version = True
-
-    def get_table_columns_and_defaults(self, tbl_name):
-        try:
-            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
-            columns = self.cursor.fetchall()
-            column_info = {}
-            for col in columns:
-                col_name = col[1]
-                default_value = col[4]
-                column_info[col_name] = default_value
-            return column_info
-        except sqlite3.Error as e:
-            logging.error(f"Error getting table columns: {e}")
-            return None
-
-    def check_and_process_data(self, data, tbl_name):
-        column_info = self.get_table_columns_and_defaults(tbl_name)
-        if column_info is None:
-            return None
-        processed_data = {}
-        for col, default in column_info.items():
-            if col == 'id' or col == 'created_at':  # auto-increment key and creation date come from the table defaults
-                continue
-            if col == 'updated_at':  # update timestamp is set explicitly here
-                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            if col in data:
-                processed_data[col] = data[col]
-
-        return processed_data
-
-    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
-        if self.lower_sqlite_version:
-            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
-
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
-
-            sql = f'''
-                INSERT INTO {tbl_name} ({columns})
-                VALUES ({placeholders})
-                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
-            '''
-            self.cursor.execute(sql, values)
-            self.conn.commit()
-
-            # Fetch the id of the inserted or updated record
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-
-            # Try a plain INSERT first
-            try:
-                sql = f'''
-                    INSERT INTO {tbl_name} ({columns})
-                    VALUES ({placeholders})
-                '''
-                self.cursor.execute(sql, values)
-                self.conn.commit()
-            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an UPDATE
-                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
-                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
-                update_values.append(data[uniq_key])
-                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
-                self.cursor.execute(sql, update_values)
-                self.conn.commit()
-
-            # Fetch the id of the inserted or updated record
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def get_id_by_key(self, tbl, uniq_key, val):
-        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
-        row = self.cursor.fetchone()
-        return row[0] if row else None
-
-    def close(self):
-        self.cursor.close()
-        self.conn.close()
 
 class SQLitePipeline(SQLiteDBHandler):
     def __init__(self, db_path=None):
@@ -188,6 +64,14 @@ class SQLitePipeline(SQLiteDBHandler):
             self._process_u001_item(item)
         elif isinstance(item, Sis001Item):
             self._process_sis001_item(item)
+        elif isinstance(item, IAFDPersonItem):
+            self._process_iafd_person_item(item)
+        elif isinstance(item, IAFDPersonDetailItem):
+            self._process_iafd_person_detail_item(item)
+        elif isinstance(item, IAFDMovieItem):
+            self._process_iafd_movie_item(item)
+        elif isinstance(item, IAFDMovieDetailItem):
+            self._process_iafd_movie_detail_item(item)
         return item
 
     def _process_u001_item(self, item):
@@ -205,5 +89,17 @@ class SQLitePipeline(SQLiteDBHandler):
         ))
         self.conn.commit()
 
+    def _process_iafd_person_item(self, item):
+        logging.info(f"deal with person item. {item}")
+
+    def _process_iafd_movie_item(self, item):
+        logging.info(f"deal with movie item. {item}")
+
+    def _process_iafd_person_detail_item(self, item):
+        logging.info(f"deal with person detail item. {item}")
+
+    def _process_iafd_movie_detail_item(self, item):
+        logging.info(f"deal with movie detail item. {item}")
+
     def close_spider(self, spider):
         self.conn.close()
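The four _process_iafd_* handlers added above are placeholders that only log the item. A hypothetical sketch of how _process_iafd_person_item could be wired to the upsert helper inherited from SQLiteDBHandler (it assumes an iafd_performers table whose columns match the IAFDPersonItem fields and that href is the unique key):

# Hypothetical implementation, not part of the commit.
def _process_iafd_person_item(self, item):
    record_id = self.insert_or_update_common(
        dict(item),                    # name, href and the from_*_list flags
        tbl_name='iafd_performers',
        uniq_key='href',
    )
    logging.info(f"deal with person item. id={record_id}")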
scrapy_proj/scrapy_proj/settings.py
@@ -30,6 +30,7 @@ ADDONS = {}
 
 # Concurrency settings
 CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100
 
 # Download delay
@@ -51,6 +52,7 @@ USER_AGENT_LIST = [
 DOWNLOADER_MIDDLEWARES = {
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
     'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
+    'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
 }
 
 # settings.py
@@ -66,7 +68,7 @@ STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensio
 #USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+#ROBOTSTXT_OBEY = True
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
scrapy_proj/scrapy_proj/spiders/iafd_spider.py (new file, 234 lines)
@@ -0,0 +1,234 @@
import scrapy
import re
import logging
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery

db_tools = IAFDQuery()


class IAFDSpider(scrapy.Spider):
    name = "iafd"
    allowed_domains = ["iafd.com"]

    host_url = "https://www.iafd.com"
    astr_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f'{host_url}/distrib.asp'
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f'{host_url}/advsearch.asp'

    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.cmd_list = cmd
        self.update = int(update)

    def start_requests(self):
        # Fetch performer lists by astrological sign
        for astro in self.astro_list:
            url = self.astr_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
            if self.debug:
                break

        # Fetch performer lists by birthday
        for month in range(1, 13):
            for day in range(1, 32):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
                if self.debug:
                    break

        # Fetch the ethnicity list
        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # Fetch the distributors list
        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

        # Fetch the studios list
        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update == 0:
            query_args['is_full_data'] = 0

        # Load the performers that still need updating
        actors = db_tools.get_performers(**query_args)
        if actors:
            for item in actors:
                href = item.get('href', '')
                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
                logging.info(f"fetch from db. item: {item}")
                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})

        # Load the movies that still need updating
        movies = db_tools.get_movies(**query_args)
        if movies:
            for item in movies:
                href = item.get('href', '')
                logging.info(f"fetch from db. item: {item}")
                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})

    async def start(self):
        # Delegate to the original start_requests logic
        async for request in super().start():
            yield request

    def parse_astro_page(self, response):
        astro = response.meta['astro']
        astro_div = response.css('div#astro')
        if astro_div:
            birth_date = None
            for elem in astro_div.css('*'):
                if elem.css('h3.astroday'):
                    birth_date = elem.css('h3.astroday::text').get().strip()
                elif elem.css('div.perficon'):
                    a_tag = elem.css('a')
                    if a_tag:
                        href = self.host_url + a_tag.attrib['href']
                        name = a_tag.css('span.perfname::text').get()
                        if name:
                            item = IAFDPersonItem()
                            item['name'] = name
                            item['href'] = href
                            item['from_astro_list'] = 1
                            item['from_birth_list'] = 0
                            item['from_ethnic_list'] = 0
                            item['from_movie_list'] = 0
                            yield item
                            #yield scrapy.Request(href, callback=self.parse_person_detail_page)

    def parse_birth_page(self, response):
        month = response.meta['month']
        day = response.meta['day']
        datarows = response.css('div.col-sm-12.col-lg-9')
        if datarows:
            rows = datarows[0].css('div.col-sm-4')
            for row in rows:
                link_tag = row.css('a')
                person = link_tag.css('::text').get().strip() if link_tag else ''
                href = (self.host_url + link_tag.attrib['href']) if link_tag else ''

                item = IAFDPersonItem()
                item['name'] = person
                item['href'] = href
                item['from_astro_list'] = 0
                item['from_birth_list'] = 1
                item['from_ethnic_list'] = 0
                item['from_movie_list'] = 0
                yield item
                #yield scrapy.Request(href, callback=self.parse_person_detail_page)

    def parse_ethnic_list_page(self, response):
        div_root = response.css('select#ethnicity1')
        if div_root:
            options = div_root.css('option')
            for option in options:
                href = option.attrib.get('value')
                text = option.css('::text').get().strip()
                if href and href.lower() != 'none':
                    ethnic_url = self.host_url + href
                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
                    if self.debug:
                        break

    def parse_ethnic_page(self, response):
        ethnic = response.meta['ethnic']
        rows = response.css('div.row.headshotrow')
        for row in rows:
            cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
            for col in cols:
                link_tag = col.css('a')
                img_tag = col.css('div.pictag')
                if link_tag and img_tag:
                    href = self.host_url + link_tag.attrib['href']
                    person = img_tag.css('::text').get().strip()

                    item = IAFDPersonItem()
                    item['name'] = person
                    item['href'] = href
                    item['from_astro_list'] = 0
                    item['from_birth_list'] = 0
                    item['from_ethnic_list'] = 1
                    item['from_movie_list'] = 0
                    yield item
                    #yield scrapy.Request(href, callback=self.parse_person_detail_page)

        next_page = response.css('a[rel="next"]')
        if next_page:
            next_url = self.host_url + next_page.attrib['href']
            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})

    def parse_distributors_list_page(self, response):
        select_element = response.css('select[name="Distrib"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = option.css('::text').get().strip()
                dis_url = self.host_url + f"/distrib.rme/distrib={value}"
                item = IAFDMovieItem()
                item['title'] = text
                item['href'] = dis_url
                item['release_year'] = 0
                item['from_performer_list'] = 0
                item['from_dist_list'] = 1
                item['from_stu_list'] = 0
                yield item
                yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)

    def parse_studios_list_page(self, response):
        select_element = response.css('select[name="Studio"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = option.css('::text').get().strip()
                stu_url = self.host_url + f"/studio.rme/studio={value}"
                item = IAFDMovieItem()
                item['title'] = text
                item['href'] = stu_url
                item['release_year'] = 0
                item['from_performer_list'] = 0
                item['from_dist_list'] = 0
                item['from_stu_list'] = 1
                yield item
                yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)

    def parse_person_detail_page(self, response):
        item = IAFDPersonDetailItem()
        item['href'] = response.url
        item['person'] = response.css('h1::text').get()  # assumes the name is in the h1 tag
        # Parse the remaining detail fields; adjust the selectors to the actual page structure
        item['gender'] = response.css('span.gender::text').get()
        item['birthday'] = response.css('span.birthday::text').get()
        item['astrology'] = response.css('span.astrology::text').get()
        item['birthplace'] = response.css('span.birthplace::text').get()
        item['years_active'] = response.css('span.years_active::text').get()
        item['ethnicity'] = response.css('span.ethnicity::text').get()
        item['nationality'] = response.css('span.nationality::text').get()
        item['hair_colors'] = response.css('span.hair_colors::text').get()
        item['eye_color'] = response.css('span.eye_color::text').get()
        item['height'] = response.css('span.height::text').get()
        item['weight'] = response.css('span.weight::text').get()
        item['measurements'] = response.css('span.measurements::text').get()
        item['tattoos'] = response.css('span.tattoos::text').get()
        item['piercings'] = response.css('span.piercings::text').get()
        item['movies_cnt'] = response.css('span.movies_cnt::text').get()
        item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
        item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
        item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
        item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
        item['performer_aka'] = response.css('span.performer_aka::text').getall()
        yield item

    def parse_movie_detail_page(self, response):
        item = IAFDMovieDetailItem()
        item['title'] = response.css('h1::text').get()  # assumes the title is in the h1 tag
        item['href'] = response.url
        # Parse more movie detail fields here as needed, based on the actual page structure
        yield item
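For reference, a hypothetical way (not part of the commit) to launch the spider with the arguments defined in its __init__; it assumes the process is started from inside the Scrapy project so get_project_settings() picks up the settings shown earlier:

# Hypothetical launcher script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.iafd_spider import IAFDSpider

process = CrawlerProcess(get_project_settings())
# debug='true' breaks out of the listing loops early and caps the DB queries at 5 rows;
# update='0' restricts the DB-driven re-crawl to rows with is_full_data = 0.
process.crawl(IAFDSpider, debug='true', update='0')
process.start()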