diff --git a/scrapy_proj/cron/cron_scheduler.sh b/scrapy_proj/cron/cron_scheduler.sh index b027e3b..a74c20d 100755 --- a/scrapy_proj/cron/cron_scheduler.sh +++ b/scrapy_proj/cron/cron_scheduler.sh @@ -133,8 +133,9 @@ fi # 每月任务 if [ "${PERIOD}" = "--monthly" ]; then - register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' " - register_spider "pbox" "scrapy crawl javhd -a mod='update' " + register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' " + register_spider "javhd" "scrapy crawl javhd -a mod='update' " + register_spider "lord" "scrapy crawl lord -a mod='update' " fi diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py index 7d0cddf..8c052bb 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py +++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py @@ -6,6 +6,7 @@ from datetime import datetime from typing import List, Dict from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path import scrapy_proj.comm.comm_def as comm +from scrapy_proj.utils.utils import pretty_json_simple # 注册器字典 spider_handler_registry = {} @@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler): except sqlite3.Error as e: logging.error(f"query error: {e}") return 0 + + +@register_handler(comm.SPIDER_NAME_LORD) +class LordDBHandler(SQLiteDBHandler): + def __init__(self, db_path=shared_db_path): + super().__init__(db_path) + self.tbl_name_actors = 'thelordofporn_actress' + self.tbl_name_alias = 'thelordofporn_alias' + + def insert_item(self, item): + if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL: + self.insert_actor(item) + else: + logging.error(f"unkown item.") + + return item + + def insert_actor(self, item): + actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False) + if actor_id: + for alias in item.get('alias', []): + alias_data = {'actress_id':actor_id, 'alias':alias} + affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id','alias'], exists_do_nothing=False) + if affected_rows: + logging.debug(f"insert/update actress_alias. data: {alias_data}") + else: + logging.warning(f"insert actor alias error!. data: {alias_data}") + else: + logging.warning(f"insert actor data error! 
data: {pretty_json_simple(item)}") + + # 统计函数 + def get_stat(self): + try: + self.cursor.execute(f""" + SELECT + (SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt + """) + + row = self.cursor.fetchone() + if not row: + logging.warning(f"query no results.") + return {} + + columns = [desc[0] for desc in self.cursor.description] + return dict(zip(columns, row)) + + except sqlite3.Error as e: + logging.error(f"query error: {e}") + return {} + + def has_full_data(self, href): + try: + self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,)) + row = self.cursor.fetchone() + return row[0] if row else None + except sqlite3.Error as e: + logging.error(f"query error: {e}") + return 0 diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py index 75ce2ea..b74cf6b 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py +++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py @@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # 应用单例元类 logging.error(f"Error inserting or updating data: {e}") return None + def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True): + """ + 针对联合主键表执行插入或更新操作 + + :param table_name: 表名 + :param data: 字典类型,待插入或更新的数据 + :param composite_pk: 列表类型,联合主键字段名集合 + :param need_update: 布尔值,记录存在时是否更新,默认True + :return: 操作影响的行数 + """ + try: + # 校验联合主键参数有效性 + if not isinstance(composite_pk, list) or len(composite_pk) < 2: + logging.error(f"联合主键必须是包含至少两个字段的列表: {composite_pk}") + return None + + processed_data = self.check_and_process_data(data, tbl_name) + + # 校验联合主键字段是否都在数据中存在 + for pk_field in composite_pk: + if pk_field not in processed_data: + logging.error(f"联合主键字段 '{pk_field}' 未在数据中提供") + return None + + # 构建查询条件 + where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk]) + pk_values = [processed_data[pk] for pk in composite_pk] + + # 检查记录是否存在 + self.cursor.execute( + f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}", + pk_values + ) + exists = self.cursor.fetchone() is not None + + if exists: + if exists_do_nothing: + return 0 + + # 构建更新字段(排除联合主键字段) + update_fields = [f for f in processed_data.keys() if f not in composite_pk] + if not update_fields: + return 0 + + set_clause = ", ".join([f"{field} = ?" for field in update_fields]) + update_values = [processed_data[field] for field in update_fields] + pk_values + + # 执行更新(兼容低版本SQLite的标准语法) + update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}" + self.cursor.execute(update_sql, update_values) + return 1 + else: + # 执行插入操作 + columns = ", ".join(processed_data.keys()) + placeholders = ", ".join(["?" 
for _ in processed_data.keys()]) + insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})" + self.cursor.execute(insert_sql, list(processed_data.values())) + return 2 + except sqlite3.Error as e: + logging.error(f"Error inserting or updating data: {e}") + return None + def get_id_by_key(self, tbl, uniq_key, val): self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,)) row = self.cursor.fetchone() diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index c922eed..bea1266 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -192,4 +192,33 @@ class JavHDActorItem(scrapy.Item): birth_date = scrapy.Field() ethnicity = scrapy.Field() birth_place = scrapy.Field() - is_full_data = scrapy.Field() \ No newline at end of file + is_full_data = scrapy.Field() + + +class LordActorItem(scrapy.Item): + item_type = scrapy.Field() + pornstar = scrapy.Field() + rating = scrapy.Field() + rank = scrapy.Field() + votes = scrapy.Field() + href = scrapy.Field() + career_start = scrapy.Field() + measurements = scrapy.Field() + born = scrapy.Field() + height = scrapy.Field() + weight = scrapy.Field() + date_modified = scrapy.Field() + global_rank = scrapy.Field() + weekly_rank = scrapy.Field() + last_month_rating = scrapy.Field() + current_rating = scrapy.Field() + total_votes = scrapy.Field() + birth_date = scrapy.Field() + birth_year = scrapy.Field() + birth_place = scrapy.Field() + height_ft = scrapy.Field() + height_cm = scrapy.Field() + weight_lbs = scrapy.Field() + weight_kg = scrapy.Field() + is_full_data = scrapy.Field() + alias = scrapy.Field() diff --git a/scrapy_proj/scrapy_proj/spiders/base_spider.py b/scrapy_proj/scrapy_proj/spiders/base_spider.py index 8fff7b7..a768752 100644 --- a/scrapy_proj/scrapy_proj/spiders/base_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/base_spider.py @@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider): yield request def parse(self, response): - """统一的响应处理入口""" + """统一的响应处理入口,实际上没有起作用,因为直接走了 scrapy.Request 里的 callback """ # 记录请求耗时 request_time = response.meta.get('request_time') if request_time: diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py index 9ea1bf8..f288a8b 100644 --- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py @@ -1,15 +1,19 @@ import scrapy import re +import sys +from urllib.parse import urljoin, quote_plus from scrapy_proj.spiders.base_spider import BaseSpider from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD +from scrapy_proj.spiders.parser.iafd_parser import common_parser +from scrapy_proj.utils.utils import pretty_json_simple db_tools = IAFDDBHandler() class IAFDSpider(BaseSpider): name = SPIDER_NAME_IAFD - allowed_domains = ["iafd.com"] + allowed_domains = ["iafd.com", "www.iafd.com"] host_url = "https://www.iafd.com" astr_base_url = f"{host_url}/astrology.rme/sign=" @@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider): studios_list_url = f"{host_url}/studio.asp" ethnic_list_url = f'{host_url}/advsearch.asp' - def __init__(self, debug='false', cmd='', update='0', *args, **kwargs): + def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs): super().__init__(*args, **kwargs) self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False - 
self.update = int(update) + self.update_mode = True if mod and mod.lower() == 'update' else False self.logger.info(f"RUN CMD: {' '.join(sys.argv)}") self.cmd_astro = 'astro' @@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider): query_args = {} if self.debug: query_args['limit'] = 5 - if self.update == 0: + if self.update_mode: query_args['is_full_data'] = 0 + query_args['is_full_data'] = 404 # 读取待更新的演员列表 if self.cmd_performers in self.cmd_list: @@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider): href = item.get('href', '') movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0 self.logger.info(f"fetch from db. item: {item}") - yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt}) + yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type':'actor'}) # 读取待更新的影片列表 if self.cmd_movies in self.cmd_list: @@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider): for item in movies: href = item.get('href', '') self.logger.info(f"fetch from db. item: {item}") - yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')}) + yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type':'movie'}) def start_astro(self): @@ -113,50 +118,28 @@ class IAFDSpider(BaseSpider): yield request def parse_astro_page(self, response): - astro = response.meta['astro'] - astro_div = response.css('div#astro') - if astro_div: - birth_date = None - for elem in astro_div.css('*'): - if elem.css('h3.astroday'): - birth_date = elem.css('h3.astroday::text').get().strip() - elif elem.css('div.perficon'): - a_tag = elem.css('a') - if a_tag: - href = self.host_url + a_tag.attrib['href'] - name = a_tag.css('span.perfname::text').get() - if name: - item = IAFDPersonItem() - item['name'] = name - item['href'] = href - item['from_astro_list'] = 1 - item['from_birth_list'] = 0 - item['from_ethnic_list'] = 0 - item['from_movie_list'] = 0 - yield item - #yield scrapy.Request(href, callback=self.parse_person_detail_page) + astro = response.meta.get('astro', '') + data, next_url = common_parser(html=response.text, page='astro', astro=astro) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + else: + self.logger.warning(f"parse data error. {response.url}") + item = IAFDPersonDetailItem() + #yield item + def parse_birth_page(self, response): month = response.meta['month'] day = response.meta['day'] - datarows = response.css('div.col-sm-12.col-lg-9') - if datarows: - rows = datarows[0].css('div.col-sm-4') - for row in rows: - link_tag = row.css('a') - person = link_tag.css('::text').get().strip() if link_tag else '' - href = self.host_url + link_tag.attrib['href'] if link_tag else '' - - item = IAFDPersonItem() - item['name'] = person - item['href'] = href - item['from_astro_list'] = 0 - item['from_birth_list'] = 1 - item['from_ethnic_list'] = 0 - item['from_movie_list'] = 0 - yield item - #yield scrapy.Request(href, callback=self.parse_person_detail_page) + data, next_url = common_parser(html=response.text, page='birth', month=month, day=day) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + else: + self.logger.warning(f"parse data error. 
{response.url}") + item = IAFDPersonDetailItem() + #yield item + def parse_ethnic_list_page(self, response): div_root = response.css('select#ethnicity1') if div_root: @@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider): href = option.attrib.get('value') text = option.css('::text').get().strip() if href and href.lower() != 'none': - ethnic_url = self.host_url + href + ethnic_url = urljoin(response.url , href) + self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}") yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text}) - if self.debug: - break def parse_ethnic_page(self, response): ethnic = response.meta['ethnic'] - rows = response.css('div.row.headshotrow') - for row in rows: - cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6') - for col in cols: - link_tag = col.css('a') - img_tag = col.css('div.pictag') - if link_tag and img_tag: - href = self.host_url + link_tag.attrib['href'] - person = img_tag.css('::text').get().strip() - - item = IAFDPersonItem() - item['name'] = person - item['href'] = href - item['from_astro_list'] = 0 - item['from_birth_list'] = 0 - item['from_ethnic_list'] = 1 - item['from_movie_list'] = 0 - yield item - #yield scrapy.Request(href, callback=self.parse_person_detail_page) - - next_page = response.css('a[rel="next"]') - if next_page: - next_url = self.host_url + next_page.attrib['href'] - yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic}) + data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") else: - self.crawler.stats.inc_value(f"{self.name}/ethnic_done") - self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}") + self.logger.warning(f"parse data error. {response.url}") + + if next_url: + self.logger.info(f"find next page: {next_url}") + else: + self.logger.info(f"found all pages. 
url: {response.url}") + + item = IAFDPersonDetailItem() + #yield item def parse_distributors_list_page(self, response): select_element = response.css('select[name="Distrib"]') @@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider): for option in options: value = option.attrib.get('value') text = option.css('::text').get().strip() - dis_url = self.host_url + f"/distrib.rme/distrib={value}" - item = IAFDMovieItem() - item['title'] = text - item['href'] = dis_url - item['release_year'] = 0 - item['from_performer_list'] = 0 - item['from_dist_list'] = 1 - item['from_stu_list'] = 0 - yield item - #yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page) + dis_url = f"{self.host_url}/distrib.rme/distrib={value}" + yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'}) def parse_studios_list_page(self, response): select_element = response.css('select[name="Studio"]') @@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider): for option in options: value = option.attrib.get('value') text = option.css('::text').get().strip() - stu_url = self.host_url + f"/studio.rme/studio={value}" - item = IAFDMovieItem() - item['title'] = text - item['href'] = stu_url - item['release_year'] = 0 - item['from_performer_list'] = 0 - item['from_dist_list'] = 0 - item['from_stu_list'] = 1 - yield item - #yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page) + dis_url = f"{self.host_url}/studio.rme/studio={value}" + yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'}) + + def parse_stu_dist_page(self, response): + list_type = response.meta.get('list_type', '') + data, next_url = common_parser(html=response.text, page=list_type) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + else: + self.logger.warning(f"fetched data error. {response.url}") + + item = IAFDPersonDetailItem() + #yield item + def parse_person_detail_page(self, response): + data = common_parser(html=response.text, page='actor', url=response.url) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + else: + self.logger.warning(f"fetched data error. 
{response.url}") + item = IAFDPersonDetailItem() - item['href'] = response.url - item['person'] = response.css('h1::text').get() # 假设姓名在 h1 标签中 - # 解析其他详细信息,根据实际页面结构修改 - item['gender'] = response.css('span.gender::text').get() - item['birthday'] = response.css('span.birthday::text').get() - item['astrology'] = response.css('span.astrology::text').get() - item['birthplace'] = response.css('span.birthplace::text').get() - item['years_active'] = response.css('span.years_active::text').get() - item['ethnicity'] = response.css('span.ethnicity::text').get() - item['nationality'] = response.css('span.nationality::text').get() - item['hair_colors'] = response.css('span.hair_colors::text').get() - item['eye_color'] = response.css('span.eye_color::text').get() - item['height'] = response.css('span.height::text').get() - item['weight'] = response.css('span.weight::text').get() - item['measurements'] = response.css('span.measurements::text').get() - item['tattoos'] = response.css('span.tattoos::text').get() - item['piercings'] = response.css('span.piercings::text').get() - item['movies_cnt'] = response.css('span.movies_cnt::text').get() - item['vixen_cnt'] = response.css('span.vixen_cnt::text').get() - item['blacked_cnt'] = response.css('span.blacked_cnt::text').get() - item['tushy_cnt'] = response.css('span.tushy_cnt::text').get() - item['x_art_cnt'] = response.css('span.x_art_cnt::text').get() - item['performer_aka'] = response.css('span.performer_aka::text').getall() - yield item + #yield item def parse_movie_detail_page(self, response): + title = response.meta.get('title', '') + data = common_parser(html=response.text, page='movies', href=response.url, title=title) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + else: + self.logger.warning(f"fetched data error. {response.url}") + item = IAFDMovieDetailItem() - item['title'] = response.css('h1::text').get() # 假设标题在 h1 标签中 - item['href'] = response.url - # 解析其他详细信息,根据实际页面结构修改 - yield item \ No newline at end of file + #yield item + + def custom_block_check(self, response): + item_type = response.meta.get('item_type', '') + if "invalid or outdated page" in response.text.lower(): + self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}") + return "invalid or outdated page" + else: + self.logger.info(f"right content. url: {response.url}") + + return None + + # 处理页面异常,主要是404, 403 + def handle_blocked(self, response, reason): + item_type = response.meta.get('item_type', '') + if response.status in [404, 403]: + self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}") \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/spiders/javhd_spider.py b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py index 5c6d1f7..cb7360a 100644 --- a/scrapy_proj/scrapy_proj/spiders/javhd_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py @@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider): item['rank'] = rank item['url'] = url item[f'{lang}_name'] = name - #TODO: 非英语的页面,要去更新对应的名字 + # 非英语的页面,要去更新对应的名字 if lang != 'en': item['url'] = replace_lang_param(item['url']) yield item @@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider): meta={"list_item": item} # 传递列表页数据到详情页 ) else: - self.logger.info(f"actor(name) has full data. skip. url: {url}") + self.logger.info(f"actor({name}) has full data. skip. 
url: {url}") # 获取下一页 next_path = data.get("pagination_params", {}).get("next") diff --git a/scrapy_proj/scrapy_proj/spiders/lord_spider.py b/scrapy_proj/scrapy_proj/spiders/lord_spider.py new file mode 100644 index 0000000..44d690e --- /dev/null +++ b/scrapy_proj/scrapy_proj/spiders/lord_spider.py @@ -0,0 +1,399 @@ +import scrapy +import sys +import re +from urllib.parse import urljoin, quote_plus +from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple +from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element +from scrapy_proj.items import LordActorItem +from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL +from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler + +db_tools = LordDBHandler() + +class LordSpider(BaseSpider): + name = SPIDER_NAME_LORD + allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"] + + # 配置请求头(复用curl中的头部信息) + custom_settings = { + "DEFAULT_REQUEST_HEADERS": { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", + "if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT", + "priority": "u=0, i", + "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "none", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0" + }, + "COOKIES_ENABLED": True # 启用Cookie支持 + } + + def __init__(self, debug='false', mod='update', *args, **kwargs): + super().__init__(*args, **kwargs) + self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False + self.update_mod = False if mod and mod.lower() == 'force' else True + + self.logger.info(f"RUN CMD: {' '.join(sys.argv)}") + + # 入口函数,由基类的方法触发 + def custom_start_requests(self): + url = 'https://thelordofporn.com/pornstars/' + yield scrapy.Request( + url=url, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头 + callback=self.parse_list, + meta={} # 传递列表页数据到详情页 + ) + + def parse_list(self, response): + # 提取所有演员条目(对应原代码中的article.loop-item) + articles = response.css("article.loop-item") + self.logger.info(f"当前页({response.url})找到 {len(articles)} 个演员条目") + + for article in articles: + try: + # 提取演员名称和详情页链接 + title_tag = article.css("h3.loop-item__title a") + title = title_tag.css("::text").get(default="N/A").strip() + href = title_tag.attrib.get("href") # 获取a标签的href属性 + + # 提取评分 + rating = article.css("div.loop-item__rating::text").get(default="N/A").strip() + + # 提取排名和投票数(对应原代码中的meta_tags) + meta_tags = article.css("div.loop-item__rank span") + rank = None + votes = None + + # 解析排名(第一个span中的b标签) + if len(meta_tags) >= 1: + rank_b = meta_tags[0].css("b::text").get() + rank = rank_b.strip() if rank_b else "N/A" + + # 解析投票数(第二个span中的b标签) + if len(meta_tags) >= 2: + votes_b = meta_tags[1].css("b::text").get() + votes = votes_b.strip() if votes_b else "N/A" + + # 转换为数值类型(模拟原代码中的utils.parse_numeric) + def parse_numeric(value): + if not value or value == "N/A": + return None + # 移除非数字字符(如逗号、%等) + numeric_str = ''.join(filter(str.isdigit, value)) + 
return int(numeric_str) if numeric_str else None + + # 构建演员数据字典 + actress_data = { + "pornstar": title, + "rating": parse_numeric(rating), + "rank": parse_numeric(rank), + "votes": parse_numeric(votes), + "href": href if href else None + } + # 发起详情查询 + actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href) + if actor_exists < 1 : + yield scrapy.Request( + url=href, + callback=self.parse_actor_detail, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), + meta = {'actor':actress_data} + ) + else: + self.logger.info(f"actor({title}) has full data. skip. url: {href}") + + except Exception as e: + self.logger.error(f"解析演员条目失败: {e}, 页面: {response.url}") + continue # 跳过错误条目,继续解析下一个 + + # 提取下一页链接(对应原代码中的.next.page-numbers) + next_page_url = None + next_page_tag = response.css(".nav-links .next.page-numbers") + if next_page_tag: + next_page_href = next_page_tag.attrib.get("href") + if next_page_href and not self.debug: + # 拼接完整URL(处理相对路径) + next_page_url = urljoin(response.url, next_page_href) + yield scrapy.Request( + url=next_page_url, + callback=self.parse_list, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), + meta = {} + ) + else: + self.logger.info(f"已解析所有页面, current url: {response.url}") + + def parse_actor_detail(self, response): + # 1. 定义字段映射表:页面原始字段 -> Item字段 + FIELD_MAPPING = { + # 基本信息 + 'date_modified': 'date_modified', + # 排名信息 + 'Global Rank': 'global_rank', + 'Weekly Rank': 'weekly_rank', + # 评分信息 + 'Last Month': 'last_month_rating', + 'Rating Av.': 'current_rating', + 'Total of Votes': 'total_votes', + # 详细属性 + 'Career start': 'career_start', + 'Measurements': 'measurements', + 'Born': 'born', + 'Height': 'height', + 'Weight': 'weight', + 'Name': 'alias_raw', # 别名对应Name字段 + # 解析后字段(出生/身高/体重) + 'birth_date': 'birth_date', + 'birth_year': 'birth_year', + 'birth_place': 'birth_place', + 'height_ft': 'height_ft', + 'height_cm': 'height_cm', + 'weight_lbs': 'weight_lbs', + 'weight_kg': 'weight_kg', + 'alias':'alias' + } + + # 2. 初始化原始数据容器 + raw_data = {} + # 3. 提取基础信息 + raw_data['href'] = response.url + entry_header = response.css("header.entry-header") + raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip() + raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip() + + # 4. 提取排名信息 + for item in entry_header.css("div.porn-star-rank__item"): + item_text = item.css("::text").get(default="").strip() + raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b"))) + + # 5. 提取评分和投票信息 + for item in response.css("div.specifications__item--horizontal"): + # 1. 精准定位标题区域(排除b标签) + # 情况1:有子div的结构(如Rating Av.带img) + title_div = item.css("div:first-child") + if title_div: + # 只提取子div内的文本(自动排除同级的b标签) + title_parts = title_div.css("::text").getall() + else: + # 情况2和3:无子div的结构(Last Month和Total of Votes) + # 提取当前item内所有文本,但排除b标签的内容 + all_text_parts = item.css("::text").getall() + b_text_parts = item.css("b::text").getall() + # 从所有文本中移除b标签的文本 + title_parts = [t for t in all_text_parts if t not in b_text_parts] + + # 2. 清理标题文本(处理非断空格和空白) + title_text = "".join(title_parts) + title_text = title_text.replace(u'\xa0', u' ') # 替换非断空格 + title_text = re.sub(r'\s+', ' ', title_text).strip() # 合并空白 + + raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b"))) + + # 6. 
提取详细属性(specifications-grid-row) + for row in response.css("div.specifications-grid-row"): + items = row.css("div.specifications-grid-item") + for i in [0, 1]: # 处理每行2个属性 + if i < len(items): + label = extract_text_from_element(items[i].css("h5")) + value = extract_text_from_element(items[i].css("span")) + if label: + raw_data[label] = value + + # 7. 处理特殊字段(别名需要清洗) + raw_data['alias'] = self.clean_alias(raw_data.get("Name", "")) + + # 9. 解析出生信息、身高、体重并合并 + raw_data.update(self.parse_birth_info(raw_data.get("Born", ""))) + raw_data.update(self.parse_height(raw_data.get("Height", ""))) + raw_data.update(self.parse_weight(raw_data.get("Weight", ""))) + + # 10. 映射到Item并返回 + item = LordActorItem() + item['item_type'] = ITEM_TYPE_ACTOR_DETAIL + actor_data = response.meta['actor'] + for k, v in actor_data.items(): + if k in item.fields: + item[k] = v + + for raw_field, item_field in FIELD_MAPPING.items(): + if item_field in item.fields: + item[item_field] = raw_data.get(raw_field, "") + + # 标记为完整数据 + item['is_full_data'] = 1 + self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}") + + yield item + + # 保留原工具函数(需作为Spider类的方法) + def parse_birth_info(self, text): + match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE) + if match: + return { + "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}", + "birth_year": match.group(3), + "birth_place": match.group(4), + } + return {"birth_date": text, "birth_year": "", "birth_place": ""} + + + def parse_height2(self, text): + match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE) + if match: + height_ft = f"{match.group(1)}'{match.group(2)}\"" + return {"height_ft": height_ft.strip(), "height_cm": match.group(3)} + return {"height_ft": text, "height_cm": ""} + def parse_height(self, text): + # 统一预处理:替换逗号为小数点,处理常见笔误(如'n'→'in') + text = text.replace(',', '.').replace(' n ', ' in ').strip() + + # 正则表达式:匹配所有英尺+英寸格式(支持多种表达方式) + # 分组说明: + # 1. 英尺数值 2. 英尺单位(feet/ft/ft./') 3. 英寸数值 4. 英寸单位(inches/in/in./inch/") + # 5. 厘米/米数值 6. 单位(cm/m) + pattern = r""" + # 情况1:先英尺英寸,后厘米/米(主流格式) + (?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分(如5ft/5') + (?:and\s*)? # 可选的"and"(如5 feet and 2 inches) + (\d+)\s*(inches|in\.?|inch|")?\s* # 英寸部分(如2in/2") + (?:\(?(\d+\.?\d*)\s*(cm|m)\)?) # 厘米/米部分(如(157cm)/(1.57m)) + + | # 或 + + # 情况2:先厘米,后英尺英寸(如170 cm / 5 feet and 7 inches) + (\d+)\s*cm\s*/\s* # 厘米在前 + (?:(\d+)\s*(feet|ft\.?|')\s*) # 英尺部分 + (?:and\s*)? + (\d+)\s*(inches|in\.?|inch|")? # 英寸部分 + + | # 或 + + # 情况3:纯简写格式(如5'3" (160 cm)) + (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)? 
# 5'3"格式 + """ + + # 使用VERBOSE忽略正则中的空格,IGNORECASE忽略大小写 + match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE) + if not match: + # 处理纯厘米格式(如"160cm") + cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE) + if cm_match: + return {"height_ft": "", "height_cm": cm_match.group(1)} + return {"height_ft": text, "height_cm": ""} + + # 提取匹配结果(根据不同情况处理分组) + ft = None + inch = None + cm = None + + # 情况1:先英尺英寸后厘米/米 + if match.group(1) and match.group(3): + ft = match.group(1) + inch = match.group(3) + num = match.group(5) + unit = match.group(6).lower() if match.group(6) else 'cm' + + # 情况2:先厘米后英尺英寸 + elif match.group(7): + cm = match.group(7) + ft = match.group(8) + inch = match.group(10) + unit = 'cm' # 情况2中前面的单位固定为cm + + # 情况3:纯简写格式(5'3") + elif match.group(11) and match.group(12): + ft = match.group(11) + inch = match.group(12) + cm = match.group(13) + unit = 'cm' + + # 处理厘米/米转换(米转厘米) + if not cm and num and unit: + if unit == 'm': + cm = str(int(float(num) * 100)) # 1.57m → 157cm + else: + cm = num # 直接使用cm数值 + + # 格式化英尺英寸表达式(如5'2") + height_ft = f"{ft}'{inch}\"" if ft and inch else "" + + return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""} + + + def parse_weight2(self, text): + match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE) + if match: + return {"weight_lbs": match.group(1), "weight_kg": match.group(2)} + return {"weight_lbs": text, "weight_kg": ""} + + def parse_weight(self, text): + # 预处理:清理空格和常见格式问题 + text = text.strip().replace(' ', ' ') + + # 正则表达式:匹配多种体重格式 + # 分组说明: + # 1. 磅数值 2. 磅单位(lb/lbs/pounds) 3. 千克数值 4. 千克单位(kg) + # 5. 千克在前的数值 6. 千克单位 7. 磅在后的数值 8. 磅单位 + pattern = r""" + # 情况1:磅在前,千克在后(主流格式) + (?:(\d+)\s*(lb|lbs|pounds)?\s*) # 磅部分(支持lb/lbs/pounds或省略单位) + (?:\(?\s*(\d+)\s*(kg)\s*\)?) # 千克部分(如(45 kg)) + + | # 或 + + # 情况2:千克在前,磅在后(如52 kg / 114 lbs) + (?:(\d+)\s*(kg)\s*/\s*) # 千克部分 + (\d+)\s*(lb|lbs|pounds)? 
# 磅部分 + """ + + # 使用VERBOSE和IGNORECASE标志增强兼容性 + match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE) + if not match: + # 尝试匹配纯千克格式(如"52kg") + kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE) + if kg_match: + return {"weight_lbs": "", "weight_kg": kg_match.group(1)} + + # 尝试匹配纯磅格式(如"114lb") + lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE) + if lb_match: + return {"weight_lbs": lb_match.group(1), "weight_kg": ""} + + # 完全无法解析的情况 + return {"weight_lbs": text, "weight_kg": ""} + + # 提取匹配结果(根据不同情况处理分组) + weight_lbs = None + weight_kg = None + + # 情况1:磅在前,千克在后 + if match.group(1) and match.group(3): + weight_lbs = match.group(1) + weight_kg = match.group(3) + + # 情况2:千克在前,磅在后 + elif match.group(5) and match.group(6): + weight_kg = match.group(5) + weight_lbs = match.group(7) + + return { + "weight_lbs": weight_lbs.strip() if weight_lbs else "", + "weight_kg": weight_kg.strip() if weight_kg else "" + } + + def clean_alias(self, alias): + alias = re.sub(r'\(Age \d+\)', '', alias, re.IGNORECASE) + return [name.strip() for name in alias.split(',') if name.strip()] + + def parse_numeric(self, value): + try: + return float(value) + except (ValueError, TypeError): + return 0 diff --git a/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py new file mode 100644 index 0000000..3464612 --- /dev/null +++ b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py @@ -0,0 +1,636 @@ + +import cloudscraper +import time +import json +import csv +import logging +import signal +import sys +import os +import re +from bs4 import BeautifulSoup +from requests.exceptions import RequestException +from functools import partial +#import config +#import utils + +# 定义基础 URL 和可变参数 +host_url = "https://www.iafd.com" + +astr_base_url = f"{host_url}/astrology.rme/sign=" +astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces'] + +birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}" + +distributors_list_url = f'{host_url}/distrib.asp' +distributors_base_url = f"{host_url}/distrib.rme/distrib=" + +studios_list_url = f"{host_url}/studio.asp" +studios_base_url = f"{host_url}/studio.rme/studio=" + +ethnic_list_url = f'{host_url}/advsearch.asp' + +# 设置 headers 和 scraper +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +} +scraper = cloudscraper.create_scraper() + +http_code_404 = 404 +http_code_login = 401 +http_code_url = 601 +http_code_local = 99 + +save_raw_html = True +load_from_local = False + +def common_parser(html, page, **kwargs): + parser = "lxml" if page=='ethnic' else "html.parser" + soup = BeautifulSoup(html, parser) + if not soup: + return None + if page == 'astro': + #parse_page_astro(soup, astro): + return parse_page_astro(soup, **kwargs) + elif page == 'birth': + #parse_page_birth(soup, month, day): + return parse_page_birth(soup, **kwargs) + elif page == 'ethnic': + #parse_page_ethnic(soup, ethnic): + return parse_page_ethnic(soup, **kwargs) + elif page == 'dist': + return parse_page_dist_stu(soup,'distable') + elif page == 'stu': + return parse_page_dist_stu(soup,'studio') + elif page == 'actor': + #parse_page_performer(soup, url): + return parse_page_performer(soup, **kwargs) + elif page == 'movies': + #parse_page_movie(soup, href, title) + return parse_page_movie(soup, **kwargs) + else: + logging.warning(f"wrong page: 
{page}") + return None + +''' +#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理 +def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None): + if load_from_local: # 从本地读取的逻辑 + html = utils.read_raw_html(url) + if html: + # 预处理 HTML(如果提供了 preprocessor) + html_text = preprocessor(html) if preprocessor else html + + soup = BeautifulSoup(html_text, parser) + if validator(soup): # 进行自定义页面检查 + return soup, http_code_local # 返回一个小于100的错误码,表明是从本地返回的 + + for attempt in range(max_retries): + try: + if host_url not in url.lower(): + logging.error(f'wrong url format: {url}') + return None, http_code_url + + response = scraper.get(url, headers=headers) + + # 处理 HTTP 状态码 + if response.status_code == 404: + logging.debug(f"Page not found (404): {url}") + return None, http_code_404 # 直接返回 404,调用方可以跳过 + + response.raise_for_status() # 处理 HTTP 错误 + + # 过期的网页,与404相同处理 + if "invalid or outdated page" in response.text.lower(): + logging.debug(f"invalid or outdated page: {url}") + return None, http_code_404 # 直接返回 404,调用方可以跳过 + + if save_raw_html: + utils.write_raw_html(url, response.text) + + # 预处理 HTML(如果提供了 preprocessor) + html_text = preprocessor(response.text) if preprocessor else response.text + + soup = BeautifulSoup(html_text, parser) + if validator(soup): # 进行自定义页面检查 + return soup, response.status_code + else: + # 检查是否发生跳转,比如到登录页面 + if response.history: + logging.warning(f"Page redirected on {url}. Validation failed.") + return None, http_code_login + + logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") + except cloudscraper.exceptions.CloudflareChallengeError as e: + logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...") + except cloudscraper.exceptions.CloudflareCode1020 as e: + logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...") + except Exception as e: + logging.error(f"Unexpected error on {url}: {e}, Retring...") + + logging.error(f'Fetching failed after max retries. {url}') + return None, None # 达到最大重试次数仍然失败 +''' + +# 修复 HTML 结构,去除多余标签并修正 标签,在获取人种的时候需要 +def preprocess_html(html): + return html.replace('
<br>', '').replace('<br/>', '')
标签 + options = div_root.find_all('option') + if options: + # 解析并输出 value 和文本内容 + for option in options: + href = option.get('value', None) + text = option.text.strip() + if href and href.lower() == 'none': + continue + list_data.append({ + "name": text, + "href": host_url + href if href else '' + }) + return list_data + + +# 解析 HTML 内容,提取需要的数据 +def parse_page_astro(soup, astro): + astro_div = soup.find("div", id="astro") + if not astro_div: + logging.warning(f"Warning: No 'astro' div found in {astro}") + return None, None + + flag = False + list_cnt = 0 + list_data = [] + next_url = None + + birth_date = None + for elem in astro_div.find_all(recursive=False): + if elem.name == "h3" and "astroday" in elem.get("class", []): + birth_date = elem.get_text(strip=True) + elif elem.name == "div" and "perficon" in elem.get("class", []): + a_tag = elem.find("a") + if a_tag: + href = host_url + a_tag["href"] + name = a_tag.find("span", class_="perfname") + if name: + list_data.append({ + "astrology": astro, + "birth_date": birth_date, + "person": name.get_text(strip=True), + "href": href + }) + flag = True + list_cnt = list_cnt +1 + if flag: + logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}") + return list_data, next_url + else: + return None, None + + +# 解析页面内容并更新birth_map +def parse_page_birth(soup, month, day): + datarows = soup.find_all('div', class_='col-sm-12 col-lg-9') + if not datarows: + return None, None + + flag = False + list_cnt = 0 + list_data = [] + next_url = None + rows = datarows[0].find_all('div', class_='col-sm-4') + for row in rows: + link_tag = row.find('a') + person = link_tag.text.strip() if link_tag else '' + href = link_tag['href'] if link_tag else '' + href = host_url + href + + # 如果 href 已经在 birth_map 中,跳过 + flag = True + if any(entry['href'] == href for entry in list_data): + continue + + # 将数据添加到 birth_map + list_data.append({ + 'month': month, + 'day': day, + 'person': person, + 'href': href + }) + list_cnt = list_cnt +1 + + if flag: + logging.debug(f"get {list_cnt} persons from this page. 
total persons: {len(list_data)}") + return list_data, next_url + else: + return None, None + + +# 解析 HTML 内容,提取需要的数据 +def parse_page_ethnic(soup, ethnic): + rows = soup.find_all('div', class_='row headshotrow') + flag = False + list_data = [] + next_url = None + + for row in rows: + for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'): + link_tag = col.find('a') + img_tag = col.find('div', class_='pictag') + flag = True + + if link_tag and img_tag: + href = host_url + link_tag['href'] + person = img_tag.text.strip() + + # 将数据存储到 ethnic_map + list_data.append({ + 'ethnic': ethnic, + 'person': person, + 'href': href + }) + if flag: + logging.debug(f"get {len(list_data)} persons from this page.") + + next_page = soup.find('a', rel='next') + if next_page: + next_url = host_url + next_page['href'] + logging.debug(f"Found next page: {next_url}") + return list_data, next_url + else: + logging.debug(f"All pages fetched for {ethnic}.") + return list_data, None + else: + return None, None + +# 解析列表页 +def parse_page_dist_stu_list(soup, select_name): + list_data = [] + next_url = None + + select_element = soup.find('select', {'name': select_name}) + if select_element : + options = select_element.find_all('option') + for option in options: + value = option.get('value') # 获取 value 属性 + text = option.text.strip() # 获取文本内容 + list_data.append({ + 'name' : text, + 'href' : str(value) + }) + return list_data, next_url + else: + return None, None + +# 解析 HTML 内容,提取需要的数据 +def parse_page_dist_stu(soup, table_id): + table = soup.find("table", id=table_id) + if not table: + logging.warning(f"Warning: No {table_id} table found ") + return None, None + + # 找到thead并跳过 + thead = table.find('thead') + if thead: + thead.decompose() # 去掉thead部分,不需要解析 + + # 现在只剩下tbody部分 + tbody = table.find('tbody') + rows = tbody.find_all('tr') if tbody else [] + + list_data = [] + next_url = None + for row in rows: + cols = row.find_all('td') + if len(cols) >= 5: + title = cols[0].text.strip() + label = cols[1].text.strip() + year = cols[2].text.strip() + rev = cols[3].text.strip() + a_href = cols[0].find('a') + href = host_url + a_href['href'] if a_href else '' + + list_data.append({ + 'title': title, + 'label': label, + 'year': year, + 'rev': rev, + 'href': href + }) + return list_data, next_url + + +# 解析 作品列表,有个人出演,也有导演的 +def parse_credits_table(table, distributor_list): + # 找到thead并跳过 + thead = table.find('thead') + if thead: + thead.decompose() # 去掉thead部分,不需要解析 + + # 现在只剩下tbody部分 + tbody = table.find('tbody') + rows = tbody.find_all('tr') if tbody else [] + + movies = [] + distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数 + + # rows = table.find_all('tr', class_='we') + for row in rows: + #tr_class = row.get('class', '') # 获取 class 属性,如果没有则返回空字符串 + tr_class = ' '.join(row.get('class', [])) # 获取 class 属性,如果没有则返回空字符串 + cols = row.find_all('td') + if len(cols) >= 6: + title = cols[0].text.strip() + href_a = cols[0].find('a') + href = href_a['href'] if href_a else '' + year = cols[1].text.strip() + distributor = cols[2].text.strip().lower() + href_d = cols[2].find('a') + href_dist = host_url + href_d['href'] if href_d else '' + notes = cols[3].text.strip() + rev = cols[4].text.strip() + formats = cols[5].text.strip() + + for key in distributor_list: + if key in distributor: + distributor_count[key] += 1 + + movies.append({ + 'title': title, + 'href' : href, + 'year': year, + 'distributor': distributor, + 'distributor_href': href_dist, + 'notes': notes, + 'rev': rev, + 'formats': 
formats, + 'tr_class': tr_class + }) + return movies, distributor_count + + +# 请求网页并提取所需数据 +def parse_page_performer(soup, url): + # 提取数据 + data = {} + + # 定义我们需要的字段名称和HTML中对应的标签 + fields = { + 'performer_aka': 'Performer AKA', + 'birthday': 'Birthday', + 'astrology': 'Astrology', + 'birthplace': 'Birthplace', + 'gender': 'Gender', + 'years_active': 'Years Active', + 'ethnicity': 'Ethnicity', + 'nationality': 'Nationality', + 'hair_colors': 'Hair Colors', + 'eye_color': 'Eye Color', + 'height': 'Height', + 'weight': 'Weight', + 'measurements': 'Measurements', + 'tattoos': 'Tattoos', + 'piercings': 'Piercings' + } + reversed_map = {v: k for k, v in fields.items()} + + # 解析表格数据, 获取参演或者导演的列表 + role_list = ['personal', 'directoral'] + distributor_list = ['vixen', 'blacked', 'tushy', 'x-art'] + credits_list = {} + + # 使用字典来存储统计 + distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数 + for role in role_list: + table = soup.find('table', id=role) + if table : + movies, stat_map = parse_credits_table(table, distributor_list) + credits_list[role] = movies + # 更新 distributor 统计 + for distributor in distributor_list: + distributor_count[distributor] += stat_map.get(distributor, 0) + + # 统计 movies 数量 + #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role]) + movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, [])) + + # 如果没有找到 + if len(credits_list) == 0 : + logging.warning(f"movie table empty. url: {url} ") + + # 遍历每个 bioheading, 获取metadata + bioheadings = soup.find_all('p', class_='bioheading') + for bio in bioheadings: + heading = bio.text.strip() + biodata = None + + # 如果包含 "Performer",需要特殊处理 + if 'Performer' in heading: + heading = 'Performer AKA' + biodata_div = bio.find_next('div', class_='biodata') + if biodata_div: + div_text = biodata_div.get_text(separator='|').strip() + biodata = [b.strip() for b in div_text.split('|') if b.strip()] + else: + biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else '' + + # 保存数据 + if heading in reversed_map: + kkey = reversed_map[heading] + data[kkey] = biodata + + # 添加统计数据到 data + data['movies_cnt'] = movies_cnt + data['vixen_cnt'] = distributor_count['vixen'] + data['blacked_cnt'] = distributor_count['blacked'] + data['tushy_cnt'] = distributor_count['tushy'] + data['x_art_cnt'] = distributor_count['x-art'] + data['credits'] = credits_list + + return data + + + +# 解析网页 HTML 并提取电影信息 +def parse_page_movie(soup, href, title): + # 解析电影基础信息 + movie_data = {} + info_div = soup.find("div", class_="col-xs-12 col-sm-3") + if info_div: + labels = info_div.find_all("p", class_="bioheading") + values = info_div.find_all("p", class_="biodata") + for label, value in zip(labels, values): + key = label.text.strip() + if key == "Directors": # 解析多位导演的情况 + directors = [] + links = value.find_all("a") + for link in links: + director_name = link.text.strip() + director_href = host_url + link['href'] if link['href'] else '' + directors.append({"name": director_name, "href": director_href}) + movie_data[key] = directors + else: + val = value.text.strip() + if key in ["Distributor", "Studio", "Director"]: + link = value.find("a") + if link: + val = link.text.strip() + movie_data[f'{key}Href'] = host_url + link['href'] + movie_data[key] = val + else: + return None + + # 解析演职人员信息 + performers = [] + cast_divs = soup.find_all("div", class_="castbox") + for cast in cast_divs: + performer = {} + link = cast.find("a") + if link: + 
performer["name"] = link.text.strip() + performer["href"] = host_url + link["href"] + + #performer["tags"] = [ + # tag.strip() for br in cast.find_all("br") + # if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip() + #] + + tags = [] + for br in cast.find_all("br"): + tag = br.next_sibling + if isinstance(tag, str) and tag.strip(): + tags.append(tag.strip()) + performer["tags"] = tags + + #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()] + performers.append(performer) + + # 解析场景拆解 + scene_breakdowns = [] + scene_table = soup.find("div", id="sceneinfo") + if scene_table: + rows = scene_table.find_all("tr") + + for row in rows: + cols = row.find_all("td") + if len(cols) >= 2: + scene = cols[0].text.strip() # 场景编号 + performer_info = cols[1] # 包含表演者及链接信息 + + # 获取
<br> 之前的完整 HTML(保留 <a> 标签等格式) + performer_html = str(performer_info) # 获取所有HTML内容 + split_html = performer_html.split("<br>") # 按 <br> 进行分割 + if split_html: + performers_html = split_html[0].strip() # 取 <br> 之前的部分 + else: + split_html = performer_html.split("<br/>") # 按 <br/> 进行分割 + if split_html: + performers_html = split_html[0].strip() # 取 <br/> 之前的部分 + else: + performers_html = performer_html.strip() # 如果没有 <br>
,取全部 + + # 解析为纯文本(去除HTML标签,仅提取文本内容) + performers_soup = BeautifulSoup(performers_html, "html.parser") + performers_text = performers_soup.get_text() + + # 提取表演者 + scene_performers = [p.strip() for p in performers_text.split(",")] + + # 尝试获取 `webscene` 和 `studio` + links_data = {} + links = performer_info.find_all("a") + if links: + webscene_title = links[0].text.strip() if len(links)>0 else None + webscene = links[0]["href"] if len(links)>0 else None + studio = links[1].text.strip() if len(links)>1 else None + studio_lnk = links[1]["href"] if len(links)>1 else None + links_data = { + "title": webscene_title, + "webscene": webscene, + "studio": studio, + "studio_lnk": studio_lnk, + } + + scene_data = { + "scene": scene, + "performers": scene_performers, + **links_data, + } + scene_breakdowns.append(scene_data) + + appears_in = [] + appears_divs = soup.find("div", id="appearssection") + if appears_divs: + rows = appears_divs.find_all("li") + for row in rows: + lnk = row.find("a") + if lnk: + appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']}) + + + return { + "href": href, + "title": title, + "Minutes": movie_data.get("Minutes", ""), + "Distributor": movie_data.get("Distributor", ""), + "Studio": movie_data.get("Studio", ""), + "ReleaseDate": movie_data.get("Release Date", ""), + "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""), + "All-Girl": movie_data.get("All-Girl", ""), + "All-Male": movie_data.get("All-Male", ""), + "Compilation": movie_data.get("Compilation", ""), + "Webscene": movie_data.get("Webscene", ""), + "Director": movie_data.get("Director", ""), + "DirectorHref": movie_data.get("DirectorHref", ""), + "DistributorHref": movie_data.get("DistributorHref", ""), + "StudioHref": movie_data.get("StudioHref", ""), + "Directors": movie_data.get("Directors", []), # 可能存在的元素 + "Performers": performers, + "SceneBreakdowns": scene_breakdowns, + "AppearsIn": appears_in, + } + + +if __name__ == "__main__": + + for astro in astro_list: + url = astr_base_url + astro + next_url = url + logging.info(f"Fetching data for {astro}, url {url} ...") + + while True: + soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id")) + if soup: + list_data, next_url = parse_page_astro(soup, astro) + if list_data: + print(list_data[0] if len(list_data)>0 else 'no data') + break + else: + logging.info(f"Retrying {next_url} ...") + time.sleep(5) # 等待后再重试 + + time.sleep(2) # 控制访问频率 \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py index 042d2e3..a1e6557 100644 --- a/scrapy_proj/scrapy_proj/utils/utils.py +++ b/scrapy_proj/scrapy_proj/utils/utils.py @@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str: ) return urlunparse(new_parsed) +def pretty_json_simple(item): + try: + # 转换为单行JSON格式,需要保证传入的是map,不能是list + return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':')) + except: + # 转换失败时返回原始字符串 + return item + \ No newline at end of file
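
Review note: the new SQLiteDBHandler.insert_or_update_with_composite_pk added in this diff returns 2 on insert, 1 on update, 0 when the row already exists and is skipped (or there is nothing to update outside the composite key), and None on error; LordDBHandler.insert_actor relies on that convention when writing thelordofporn_alias rows. The sketch below is a minimal, self-contained illustration of the same return-code convention against an in-memory copy of the alias table — it is not the project's handler, and the helper name upsert_alias is invented for this example.

import sqlite3

def upsert_alias(cur, actress_id, alias):
    # Mirror the diff's convention: 0 = row exists / skipped, 2 = inserted.
    cur.execute(
        "SELECT 1 FROM thelordofporn_alias WHERE actress_id = ? AND alias = ?",
        (actress_id, alias),
    )
    if cur.fetchone() is not None:
        # Both columns form the composite PK, so there is nothing left to update.
        return 0
    cur.execute(
        "INSERT INTO thelordofporn_alias (actress_id, alias) VALUES (?, ?)",
        (actress_id, alias),
    )
    return 2

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute(
    "CREATE TABLE thelordofporn_alias ("
    "actress_id INTEGER, alias TEXT, PRIMARY KEY (actress_id, alias))"
)
print(upsert_alias(cur, 1, "Some Alias"))  # 2 -> inserted
print(upsert_alias(cur, 1, "Some Alias"))  # 0 -> already present, skipped
conn.commit()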