From ff490462123d6b1e3b8efced585ec7aab108ecf3 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 3 Jul 2025 11:41:30 +0800
Subject: [PATCH] modify scripts

---
 .../scrapy_proj/db_wapper/iafd_query.py       | 104 ++++++++
 .../scrapy_proj/db_wapper/sqlite_base.py      | 134 ++++++++++
 scrapy_proj/scrapy_proj/items.py              |  47 +++-
 scrapy_proj/scrapy_proj/middlewares.py        |  65 +++++
 scrapy_proj/scrapy_proj/pipelines.py          | 148 ++---------
 scrapy_proj/scrapy_proj/settings.py           |   4 +-
 .../scrapy_proj/spiders/iafd_spider.py        | 234 ++++++++++++++++++
 7 files changed, 608 insertions(+), 128 deletions(-)
 create mode 100644 scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
 create mode 100644 scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
 create mode 100644 scrapy_proj/scrapy_proj/spiders/iafd_spider.py

diff --git a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py b/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
new file mode 100644
index 0000000..ecf9b17
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
@@ -0,0 +1,104 @@
+import os
+import sqlite3
+import logging
+from datetime import datetime
+from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path
+
+
+class IAFDQuery(SQLiteDBHandler):
+    def __init__(self, db_path=shared_db_path):
+        super().__init__(db_path)
+        self.tbl_name_performers = 'iafd_performers'
+        self.tbl_name_movies = 'iafd_movies'
+        self.uniq_key = 'href'
+
+    # Query performer records matching the given filters
+    def get_performers(self, **filters):
+        try:
+            sql = f"SELECT href, name, id, movies_cnt FROM {self.tbl_name_performers} WHERE 1=1"
+            params = []
+
+            conditions = {
+                "id": " AND id = ?",
+                "href": " AND href = ?",
+                "name": " AND name LIKE ?",
+                "is_full_data": " AND is_full_data = ?",
+                "start_id": " AND id > ?",
+            }
+
+            for key, condition in conditions.items():
+                if key in filters:
+                    sql += condition
+                    if key == "name":
+                        params.append(f"%{filters[key]}%")
+                    else:
+                        params.append(filters[key])
+
+            for key in ["is_full_data_in", "is_full_data_not_in"]:
+                if key in filters:
+                    values = filters[key]
+                    if values:
+                        placeholders = ", ".join(["?"] * len(values))
+                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
+                        sql += f" AND is_full_data {operator} ({placeholders})"
+                        params.extend(values)
+
+            if "order_by" in filters:
+                # Note: the ORDER BY column name is interpolated directly; a placeholder would be bound as a string literal
+                sql += f" ORDER BY {filters['order_by']} "
+
+            if 'limit' in filters:
+                sql += " LIMIT ?"
+                params.append(filters["limit"])
+
+            self.cursor.execute(sql, params)
+            return [dict(row) for row in self.cursor.fetchall()]
+        except sqlite3.Error as e:
+            logging.error(f"Failed to query performers: {e}")
+            return None
+
+
+    # Query movie records matching the given filters
+    def get_movies(self, **filters):
+        try:
+            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
+            params = []
+
+            conditions = {
+                "id": " AND id = ?",
+                "href": " AND href = ?",
+                "title": " AND title LIKE ?",
+                "is_full_data": " AND is_full_data = ?",
+                "start_id": " AND id > ?",
+            }
+
+            for key, condition in conditions.items():
+                if key in filters:
+                    sql += condition
+                    if key == "title":
+                        params.append(f"%{filters[key]}%")
+                    else:
+                        params.append(filters[key])
+
+            for key in ["is_full_data_in", "is_full_data_not_in"]:
+                if key in filters:
+                    values = filters[key]
+                    if values:
+                        placeholders = ", ".join(["?"] * len(values))
+                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
+                        sql += f" AND is_full_data {operator} ({placeholders})"
+                        params.extend(values)
+
+            if "order_by" in filters:
+                # Note: the ORDER BY column name is interpolated directly; a placeholder would be bound as a string literal
+                sql += f" ORDER BY {filters['order_by']} "
+
+            if 'limit' in filters:
+                sql += " LIMIT ?"
+                params.append(filters["limit"])
+
+            self.cursor.execute(sql, params)
+            return [dict(row) for row in self.cursor.fetchall()]
+        except sqlite3.Error as e:
+            logging.error(f"Failed to query movies: {e}")
+            return None
diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
new file mode 100644
index 0000000..50184c9
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
@@ -0,0 +1,134 @@
+import os
+import sqlite3
+import logging
+from datetime import datetime
+
+home_dir = os.path.expanduser("~")
+global_share_data_dir = f'{home_dir}/sharedata'
+default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
+shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"
+
+# Base database class that wraps the common SQLite operations.
+class SQLiteDBHandler:
+    def __init__(self, db_path=None):
+        # Use the provided db_path or fall back to the default path
+        self.DB_PATH = db_path or default_dbpath
+
+        # Make sure the parent directory exists (optional)
+        if db_path and not os.path.exists(os.path.dirname(db_path)):
+            os.makedirs(os.path.dirname(db_path))
+
+        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
+        self.conn.execute('PRAGMA journal_mode = WAL')  # Enable WAL (Write-Ahead Logging) mode
+        self.conn.commit()
+
+        self.conn.row_factory = sqlite3.Row  # Allow dict-style access to result rows
+        self.cursor = self.conn.cursor()
+
+        # Check the SQLite version
+        self.lower_sqlite_version = False
+        sqlite_version = sqlite3.sqlite_version_info
+        if sqlite_version < (3, 24, 0):
+            self.lower_sqlite_version = True
+
+    def get_table_columns_and_defaults(self, tbl_name):
+        try:
+            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
+            columns = self.cursor.fetchall()
+            column_info = {}
+            for col in columns:
+                col_name = col[1]
+                default_value = col[4]
+                column_info[col_name] = default_value
+            return column_info
+        except sqlite3.Error as e:
+            logging.error(f"Error getting table columns: {e}")
+            return None
+
+    def check_and_process_data(self, data, tbl_name):
+        column_info = self.get_table_columns_and_defaults(tbl_name)
+        if column_info is None:
+            return None
+        processed_data = {}
+        for col, default in column_info.items():
+            if col == 'id' or col == 'created_at':  # auto-increment key and creation date come from table defaults
+                continue
+            if col == 'updated_at':  # always refresh the update timestamp (may be overridden below if provided)
+                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            if col in data:
+                processed_data[col] = data[col]
+
+        return processed_data
+
+    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
+        if self.lower_sqlite_version:
+            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
+
+        try:
+            processed_data = self.check_and_process_data(data, tbl_name)
+            if processed_data is None:
+                return None
+
+            columns = ', '.join(processed_data.keys())
+            values = list(processed_data.values())
+            placeholders = ', '.join(['?' for _ in values])
+            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
+
+            sql = f'''
+                INSERT INTO {tbl_name} ({columns})
+                VALUES ({placeholders})
+                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
+            '''
+            self.cursor.execute(sql, values)
+            self.conn.commit()
+
+            # Fetch the ID of the inserted or updated record
+            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
+            record_id = self.cursor.fetchone()[0]
+            return record_id
+        except sqlite3.Error as e:
+            logging.error(f"Error inserting or updating data: {e}")
+            return None
+
+    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
+        try:
+            processed_data = self.check_and_process_data(data, tbl_name)
+            if processed_data is None:
+                return None
+
+            columns = ', '.join(processed_data.keys())
+            values = list(processed_data.values())
+            placeholders = ', '.join(['?' for _ in values])
+
+            # Try a plain INSERT first
+            try:
+                sql = f'''
+                    INSERT INTO {tbl_name} ({columns})
+                    VALUES ({placeholders})
+                '''
+                self.cursor.execute(sql, values)
+                self.conn.commit()
+            except sqlite3.IntegrityError:  # Unique-key conflict: fall back to an UPDATE
+                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
+                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
+                update_values.append(data[uniq_key])
+                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
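+                # update_values holds the non-key column values, followed by the unique-key value bound to the WHERE clause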
+                self.cursor.execute(sql, update_values)
+                self.conn.commit()
+
+            # Fetch the ID of the inserted or updated record
+            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
+            record_id = self.cursor.fetchone()[0]
+            return record_id
+        except sqlite3.Error as e:
+            logging.error(f"Error inserting or updating data: {e}")
+            return None
+
+    def get_id_by_key(self, tbl, uniq_key, val):
+        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
+        row = self.cursor.fetchone()
+        return row[0] if row else None
+
+    def close(self):
+        self.cursor.close()
+        self.conn.close()
diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py
index cd4e85f..984b6ff 100644
--- a/scrapy_proj/scrapy_proj/items.py
+++ b/scrapy_proj/scrapy_proj/items.py
@@ -19,4 +19,49 @@ class U001Item(scrapy.Item):
 class Sis001Item(scrapy.Item):
     title = scrapy.Field()
     url = scrapy.Field()
-    plate_name = scrapy.Field()
\ No newline at end of file
+    plate_name = scrapy.Field()
+
+class IAFDPersonItem(scrapy.Item):
+    name = scrapy.Field()
+    href = scrapy.Field()
+    from_astro_list = scrapy.Field()
+    from_birth_list = scrapy.Field()
+    from_ethnic_list = scrapy.Field()
+    from_movie_list = scrapy.Field()
+
+class IAFDMovieItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    release_year = scrapy.Field()
+    from_performer_list = scrapy.Field()
+    from_dist_list = scrapy.Field()
+    from_stu_list = scrapy.Field()
+
+class IAFDPersonDetailItem(scrapy.Item):
+    href = scrapy.Field()
+    person = scrapy.Field()
+    gender = scrapy.Field()
+    birthday = scrapy.Field()
+    astrology = scrapy.Field()
+    birthplace = scrapy.Field()
+    years_active = scrapy.Field()
+    ethnicity = scrapy.Field()
+    nationality = scrapy.Field()
+    hair_colors = scrapy.Field()
+    eye_color = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    measurements = scrapy.Field()
+    tattoos = scrapy.Field()
+    piercings = scrapy.Field()
+    movies_cnt = scrapy.Field()
+    vixen_cnt = scrapy.Field()
+    blacked_cnt = scrapy.Field()
+    tushy_cnt = scrapy.Field()
+    x_art_cnt = scrapy.Field()
+    performer_aka = scrapy.Field()
+
+class IAFDMovieDetailItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    # Additional movie detail fields can be added as needed
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/middlewares.py b/scrapy_proj/scrapy_proj/middlewares.py
index a22d966..fdfd870 100644
--- a/scrapy_proj/scrapy_proj/middlewares.py
+++ b/scrapy_proj/scrapy_proj/middlewares.py
@@ -98,3 +98,68 @@ class ScrapyProjDownloaderMiddleware:
 
     def spider_opened(self, spider):
         spider.logger.info("Spider opened: %s" % spider.name)
+
+
+import cloudscraper
+from scrapy.http import TextResponse
+import datetime
+# Fetch pages through cloudscraper instead of Scrapy's default downloader
+class CloudScraperMiddleware:
+    def __init__(self, stats):
+        self.scraper = cloudscraper.create_scraper()
+        self.stats = stats  # injected stats collector
+        # Domains that should be fetched with cloudscraper
+        self.target_domains = {'iafd.com', 'another-domain.com'}
+
+        # Default request headers
+        self.iafd_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            stats=crawler.stats  # Scrapy stats collector
+        )
+
+    def process_request(self, request, spider):
+        # Record the request start time
+        start_time = datetime.datetime.now()
+
+        try:
+            # Send the request with cloudscraper
+            response = self.scraper.get(
+                request.url,
+                headers=self.iafd_headers,
+                cookies=request.cookies
+            )
+
+            # Request duration in milliseconds
+            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+
+            # Update downloader stats
+            self.stats.inc_value('downloader/request_count')
+            self.stats.inc_value('downloader/request_method_count/GET')
+            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
+
+            self.stats.inc_value('downloader/response_count')
+            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
+            self.stats.inc_value('downloader/response_bytes', len(response.content))
+
+            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+
+            # Build a Scrapy response object
+            return TextResponse(
+                url=response.url,
+                status=response.status_code,
+                body=response.content,
+                encoding=response.encoding,
+                request=request
+            )
+
+        except Exception as e:
+            # Record the failure
+            self.stats.inc_value('downloader/exception_count')
+            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            spider.logger.error(f"CloudScraper request failed: {e}")
+            return None  # fall back to the default downloader on failure
diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py
index 2f8b2cb..18a3e4e 100644
--- a/scrapy_proj/scrapy_proj/pipelines.py
+++ b/scrapy_proj/scrapy_proj/pipelines.py
@@ -15,132 +15,8 @@ import os
 import sqlite3
 import logging
 from datetime import datetime
-from scrapy_proj.items import U001Item, Sis001Item
-
-home_dir = os.path.expanduser("~")
-global_share_data_dir = f'{home_dir}/sharedata'
-default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
-
-# 数据库基类,封装了通用的操作。
-class SQLiteDBHandler:
-    def __init__(self, db_path=None):
-        # 使用传入的 db_path 或默认路径
-        self.DB_PATH = db_path or default_dbpath
-
-        # 验证路径是否存在(可选)
-        if db_path and not os.path.exists(os.path.dirname(db_path)):
-            os.makedirs(os.path.dirname(db_path))
-
-        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
-        self.cursor = self.conn.cursor()
-
-        # 检查 SQLite 版本
-        self.lower_sqlite_version = False
-        sqlite_version = sqlite3.sqlite_version_info
-        if sqlite_version < (3, 24, 0):
-            self.lower_sqlite_version = True
-
-    def get_table_columns_and_defaults(self, tbl_name):
-        try:
-            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
-            columns = self.cursor.fetchall()
-            column_info = {}
-            for col in columns:
-                col_name = col[1]
-                default_value = col[4]
-                column_info[col_name] = default_value
-            return column_info
-        except sqlite3.Error as e:
-            logging.error(f"Error getting table columns: {e}")
-            return None
-
-    def check_and_process_data(self, data, tbl_name):
-        column_info = self.get_table_columns_and_defaults(tbl_name)
-        if column_info is None:
-            return None
-        processed_data = {}
-        for col, default in column_info.items():
-            if col == 'id' or col == 'created_at': # 自增主键,不需要用户提供; 创建日期,使用建表默认值
-                continue
-            if col == 'updated_at': # 日期函数,用户自己指定即可
-                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            if col in data:
-                processed_data[col] = data[col]
-
-        return processed_data
-
-    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
-        if self.lower_sqlite_version:
-            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
-
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
-
-            sql = f'''
-                INSERT INTO {tbl_name} ({columns})
-                VALUES ({placeholders})
-                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
-            '''
-            self.cursor.execute(sql, values)
-            self.conn.commit()
-
-            # 获取插入或更新后的记录 ID
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-
-            # 先尝试插入数据
-            try:
-                sql = f'''
-                    INSERT INTO {tbl_name} ({columns})
-                    VALUES ({placeholders})
-                '''
-                self.cursor.execute(sql, values)
-                self.conn.commit()
-            except sqlite3.IntegrityError: # 唯一键冲突,执行更新操作
-                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
-                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
-                update_values.append(data[uniq_key])
-                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
-                self.cursor.execute(sql, update_values)
-                self.conn.commit()
-
-            # 获取插入或更新后的记录 ID
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def get_id_by_key(self, tbl, uniq_key, val):
-        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
-        row = self.cursor.fetchone()
-        return row[0] if row else None
-
-    def close(self):
-        self.cursor.close()
-        self.conn.close()
+from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
+from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
 
 class SQLitePipeline(SQLiteDBHandler):
     def __init__(self, db_path=None):
@@ -188,6 +64,14 @@ class SQLitePipeline(SQLiteDBHandler):
             self._process_u001_item(item)
         elif isinstance(item, Sis001Item):
             self._process_sis001_item(item)
+        elif isinstance(item, IAFDPersonItem):
+            self._process_iafd_person_item(item)
+        elif isinstance(item, IAFDPersonDetailItem):
+            self._process_iafd_person_detail_item(item)
+        elif isinstance(item, IAFDMovieItem):
+            self._process_iafd_movie_item(item)
+        elif isinstance(item, IAFDMovieDetailItem):
+            self._process_iafd_movie_detail_item(item)
         return item
 
     def _process_u001_item(self, item):
@@ -205,5 +89,17 @@ class SQLitePipeline(SQLiteDBHandler):
             ))
             self.conn.commit()
 
+    def _process_iafd_person_item(self, item):
+        logging.info(f"Processing IAFD person item: {item}")
+
+    def _process_iafd_movie_item(self, item):
+        logging.info(f"Processing IAFD movie item: {item}")
+
+    def _process_iafd_person_detail_item(self, item):
+        logging.info(f"Processing IAFD person detail item: {item}")
+
+    def _process_iafd_movie_detail_item(self, item):
+        logging.info(f"Processing IAFD movie detail item: {item}")
+
     def close_spider(self, spider):
         self.conn.close()
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py
index 7cc73ba..b020070 100644
--- a/scrapy_proj/scrapy_proj/settings.py
+++ b/scrapy_proj/scrapy_proj/settings.py
@@ -30,6 +30,7 @@ ADDONS = {}
 
 # 并发设置
 CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100
 
 # 下载延迟
@@ -51,6 +52,7 @@ USER_AGENT_LIST = [
 DOWNLOADER_MIDDLEWARES = {
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
     'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
+    'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
 }
 
 # settings.py
@@ -66,7 +68,7 @@ STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensio
 #USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+#ROBOTSTXT_OBEY = True
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
new file mode 100644
index 0000000..d28f324
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -0,0 +1,234 @@
+import scrapy
+import re
+import logging
+from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
+from scrapy_proj.db_wapper.iafd_query import IAFDQuery
+
+db_tools = IAFDQuery()
+
+class IAFDSpider(scrapy.Spider):
+    name = "iafd"
+    allowed_domains = ["iafd.com"]
+
+    host_url = "https://www.iafd.com"
+    astr_base_url = f"{host_url}/astrology.rme/sign="
+    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
+    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
+    distributors_list_url = f'{host_url}/distrib.asp'
+    studios_list_url = f"{host_url}/studio.asp"
+    ethnic_list_url = f'{host_url}/advsearch.asp'
+
+    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        self.cmd_list = cmd
+        self.update = int(update)
+
+    def start_requests(self):
+        # Performer lists by astrological sign
+        for astro in self.astro_list:
+            url = self.astr_base_url + astro
+            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
+            if self.debug:
+                break
+
+        # Performer lists by birthday
+        for month in range(1, 13):
+            for day in range(1, 32):
+                url = self.birth_base_url.format(month=month, day=day)
+                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
+                if self.debug:
+                    break
+
+        # Performer lists by ethnicity
+        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
+
+        # Distributor list
+        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
+
+        # Studio list
+        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
+
+        query_args = {}
+        if self.debug:
+            query_args['limit'] = 5
+        if self.update == 0:
+            query_args['is_full_data'] = 0
+
+        # Load performers pending update from the database
+        actors = db_tools.get_performers(**query_args)
+        if actors:
+            for item in actors:
+                href = item.get('href', '')
+                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
+                logging.info(f"fetch from db. item: {item}")
+                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+
+        # Load movies pending update from the database
+        movies = db_tools.get_movies(**query_args)
+        if movies:
+            for item in movies:
+                href = item.get('href', '')
+                logging.info(f"fetch from db. item: {item}")
+                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+
+
+    async def start(self):
+        # Delegate to the original start_requests() method
+        async for request in super().start():
+            yield request
+
+    def parse_astro_page(self, response):
+        astro = response.meta['astro']
+        astro_div = response.css('div#astro')
+        if astro_div:
+            birth_date = None
+            for elem in astro_div.css('*'):
+                if elem.css('h3.astroday'):
+                    birth_date = elem.css('h3.astroday::text').get().strip()
+                elif elem.css('div.perficon'):
+                    a_tag = elem.css('a')
+                    if a_tag:
+                        href = self.host_url + a_tag.attrib['href']
+                        name = a_tag.css('span.perfname::text').get()
+                        if name:
+                            item = IAFDPersonItem()
+                            item['name'] = name
+                            item['href'] = href
+                            item['from_astro_list'] = 1
+                            item['from_birth_list'] = 0
+                            item['from_ethnic_list'] = 0
+                            item['from_movie_list'] = 0
+                            yield item
+                            #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+
+    def parse_birth_page(self, response):
+        month = response.meta['month']
+        day = response.meta['day']
+        datarows = response.css('div.col-sm-12.col-lg-9')
+        if datarows:
+            rows = datarows[0].css('div.col-sm-4')
+            for row in rows:
+                link_tag = row.css('a')
+                person = link_tag.css('::text').get().strip() if link_tag else ''
+                href = self.host_url + link_tag.attrib['href'] if link_tag else ''
+
+                item = IAFDPersonItem()
+                item['name'] = person
+                item['href'] = href
+                item['from_astro_list'] = 0
+                item['from_birth_list'] = 1
+                item['from_ethnic_list'] = 0
+                item['from_movie_list'] = 0
+                yield item
+                #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+
+    def parse_ethnic_list_page(self, response):
+        div_root = response.css('select#ethnicity1')
+        if div_root:
+            options = div_root.css('option')
+            for option in options:
+                href = option.attrib.get('value')
+                text = option.css('::text').get().strip()
+                if href and href.lower() != 'none':
+                    ethnic_url = self.host_url + href
+                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
+                    if self.debug:
+                        break
+
+    def parse_ethnic_page(self, response):
+        ethnic = response.meta['ethnic']
+        rows = response.css('div.row.headshotrow')
+        for row in rows:
+            cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
+            for col in cols:
+                link_tag = col.css('a')
+                img_tag = col.css('div.pictag')
+                if link_tag and img_tag:
+                    href = self.host_url + link_tag.attrib['href']
+                    person = img_tag.css('::text').get().strip()
+
+                    item = IAFDPersonItem()
+                    item['name'] = person
+                    item['href'] = href
+                    item['from_astro_list'] = 0
+                    item['from_birth_list'] = 0
+                    item['from_ethnic_list'] = 1
+                    item['from_movie_list'] = 0
+                    yield item
+                    #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+
+        next_page = response.css('a[rel="next"]')
+        if next_page:
+            next_url = self.host_url + next_page.attrib['href']
+            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+
+    def parse_distributors_list_page(self, response):
+        select_element = response.css('select[name="Distrib"]')
+        if select_element:
+            options = select_element.css('option')
+            for option in options:
+                value = option.attrib.get('value')
+                text = option.css('::text').get().strip()
+                dis_url = self.host_url + f"/distrib.rme/distrib={value}"
+                item = IAFDMovieItem()
+                item['title'] = text
+                item['href'] = dis_url
+                item['release_year'] = 0
+                item['from_performer_list'] = 0
+                item['from_dist_list'] = 1
+                item['from_stu_list'] = 0
+                yield item
+                yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
+
+    def parse_studios_list_page(self, response):
+        select_element = response.css('select[name="Studio"]')
+        if select_element:
+            options = select_element.css('option')
+            for option in options:
+                value = option.attrib.get('value')
+                text = option.css('::text').get().strip()
+                stu_url = self.host_url + f"/studio.rme/studio={value}"
+                item = IAFDMovieItem()
+                item['title'] = text
+                item['href'] = stu_url
+                item['release_year'] = 0
+                item['from_performer_list'] = 0
+                item['from_dist_list'] = 0
+                item['from_stu_list'] = 1
+                yield item
+                yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
+
+    def parse_person_detail_page(self, response):
+        item = IAFDPersonDetailItem()
+        item['href'] = response.url
+        item['person'] = response.css('h1::text').get()  # assumes the name is in the h1 tag
+        # Parse the remaining details; adjust the selectors to the actual page structure
+        item['gender'] = response.css('span.gender::text').get()
+        item['birthday'] = response.css('span.birthday::text').get()
+        item['astrology'] = response.css('span.astrology::text').get()
+        item['birthplace'] = response.css('span.birthplace::text').get()
+        item['years_active'] = response.css('span.years_active::text').get()
+        item['ethnicity'] = response.css('span.ethnicity::text').get()
+        item['nationality'] = response.css('span.nationality::text').get()
+        item['hair_colors'] = response.css('span.hair_colors::text').get()
+        item['eye_color'] = response.css('span.eye_color::text').get()
+        item['height'] = response.css('span.height::text').get()
+        item['weight'] = response.css('span.weight::text').get()
+        item['measurements'] = response.css('span.measurements::text').get()
+        item['tattoos'] = response.css('span.tattoos::text').get()
+        item['piercings'] = response.css('span.piercings::text').get()
+        item['movies_cnt'] = response.css('span.movies_cnt::text').get()
+        item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
+        item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
+        item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
+        item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
+        item['performer_aka'] = response.css('span.performer_aka::text').getall()
+        yield item
+
+    def parse_movie_detail_page(self, response):
+        item = IAFDMovieDetailItem()
+        item['title'] = response.css('h1::text').get()  # assumes the title is in the h1 tag
+        item['href'] = response.url
+        # Parse the remaining details; adjust to the actual page structure
+        yield item
\ No newline at end of file