modify scripts
@@ -134,7 +134,8 @@ fi
 # monthly tasks
 if [ "${PERIOD}" = "--monthly" ]; then
     register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
-    register_spider "pbox" "scrapy crawl javhd -a mod='update' "
+    register_spider "javhd" "scrapy crawl javhd -a mod='update' "
+    register_spider "lord" "scrapy crawl lord -a mod='update' "
 fi

@@ -6,6 +6,7 @@ from datetime import datetime
 from typing import List, Dict
 from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
 import scrapy_proj.comm.comm_def as comm
+from scrapy_proj.utils.utils import pretty_json_simple

 # handler registry
 spider_handler_registry = {}
@@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
         except sqlite3.Error as e:
             logging.error(f"query error: {e}")
             return 0
+
+
+@register_handler(comm.SPIDER_NAME_LORD)
+class LordDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=shared_db_path):
+        super().__init__(db_path)
+        self.tbl_name_actors = 'thelordofporn_actress'
+        self.tbl_name_alias = 'thelordofporn_alias'
+
+    def insert_item(self, item):
+        if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
+            self.insert_actor(item)
+        else:
+            logging.error(f"unknown item.")
+
+        return item
+
+    def insert_actor(self, item):
+        actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
+        if actor_id:
+            for alias in item.get('alias', []):
+                alias_data = {'actress_id': actor_id, 'alias': alias}
+                affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id', 'alias'], exists_do_nothing=False)
+                if affected_rows:
+                    logging.debug(f"insert/update actress_alias. data: {alias_data}")
+                else:
+                    logging.warning(f"insert actor alias error! data: {alias_data}")
+        else:
+            logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")
+
+    # statistics helper
+    def get_stat(self):
+        try:
+            self.cursor.execute(f"""
+                SELECT
+                    (SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
+            """)
+
+            row = self.cursor.fetchone()
+            if not row:
+                logging.warning(f"query no results.")
+                return {}
+
+            columns = [desc[0] for desc in self.cursor.description]
+            return dict(zip(columns, row))
+
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return {}
+
+    def has_full_data(self, href):
+        try:
+            self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
+            row = self.cursor.fetchone()
+            return row[0] if row else None
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return 0
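The @register_handler(...) decorator applied to the new LordDBHandler is defined elsewhere in this module; as a reference, a minimal standalone sketch of the registry pattern it appears to implement (the names and the "lord" key below are illustrative, not taken from the project):

import logging

spider_handler_registry = {}

def register_handler(spider_name):
    """Class decorator: map a spider name to its DB handler class."""
    def wrapper(cls):
        spider_handler_registry[spider_name] = cls
        return cls
    return wrapper

@register_handler("lord")
class LordDBHandler:
    pass

print(spider_handler_registry)   # {'lord': <class '__main__.LordDBHandler'>}

Looking up spider_handler_registry[spider.name] at pipeline time then gives the right handler class without a chain of if/elif branches.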
@@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta):  # singleton metaclass applied
             logging.error(f"Error inserting or updating data: {e}")
             return None

+    def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
+        """
+        Insert or update a row in a table that uses a composite primary key.
+
+        :param tbl_name: table name
+        :param data: dict of column values to insert or update
+        :param composite_pk: list of column names that make up the composite primary key
+        :param exists_do_nothing: when True, do nothing if the record already exists (default True)
+        :return: number of affected rows
+        """
+        try:
+            # validate the composite-key argument
+            if not isinstance(composite_pk, list) or len(composite_pk) < 2:
+                logging.error(f"composite_pk must be a list with at least two fields: {composite_pk}")
+                return None
+
+            processed_data = self.check_and_process_data(data, tbl_name)
+
+            # make sure every composite-key field is present in the data
+            for pk_field in composite_pk:
+                if pk_field not in processed_data:
+                    logging.error(f"composite key field '{pk_field}' is missing from the data")
+                    return None
+
+            # build the lookup condition
+            where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
+            pk_values = [processed_data[pk] for pk in composite_pk]
+
+            # check whether the record already exists
+            self.cursor.execute(
+                f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
+                pk_values
+            )
+            exists = self.cursor.fetchone() is not None
+
+            if exists:
+                if exists_do_nothing:
+                    return 0
+
+                # build the update columns (excluding the composite-key fields)
+                update_fields = [f for f in processed_data.keys() if f not in composite_pk]
+                if not update_fields:
+                    return 0
+
+                set_clause = ", ".join([f"{field} = ?" for field in update_fields])
+                update_values = [processed_data[field] for field in update_fields] + pk_values
+
+                # run the update (standard syntax, compatible with older SQLite versions)
+                update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
+                self.cursor.execute(update_sql, update_values)
+                return 1
+            else:
+                # insert a new row
+                columns = ", ".join(processed_data.keys())
+                placeholders = ", ".join(["?" for _ in processed_data.keys()])
+                insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
+                self.cursor.execute(insert_sql, list(processed_data.values()))
+                return 2
+        except sqlite3.Error as e:
+            logging.error(f"Error inserting or updating data: {e}")
+            return None
+
     def get_id_by_key(self, tbl, uniq_key, val):
         self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
         row = self.cursor.fetchone()
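For context, a self-contained sketch of the same check-then-update-or-insert flow against an in-memory SQLite table with a composite primary key; the table and column names below are illustrative only:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE actress_alias (actress_id INTEGER, alias TEXT, PRIMARY KEY (actress_id, alias))")

def upsert(data, tbl, composite_pk):
    where = " AND ".join(f"{c} = ?" for c in composite_pk)
    pk_vals = [data[c] for c in composite_pk]
    cur.execute(f"SELECT 1 FROM {tbl} WHERE {where}", pk_vals)
    if cur.fetchone():
        fields = [c for c in data if c not in composite_pk]
        if not fields:
            return 0                      # record exists, nothing besides the key to update
        set_clause = ", ".join(f"{c} = ?" for c in fields)
        cur.execute(f"UPDATE {tbl} SET {set_clause} WHERE {where}", [data[c] for c in fields] + pk_vals)
        return 1
    cols = ", ".join(data)
    qs = ", ".join("?" for _ in data)
    cur.execute(f"INSERT INTO {tbl} ({cols}) VALUES ({qs})", list(data.values()))
    return 2

print(upsert({"actress_id": 1, "alias": "A"}, "actress_alias", ["actress_id", "alias"]))  # 2 (inserted)
print(upsert({"actress_id": 1, "alias": "A"}, "actress_alias", ["actress_id", "alias"]))  # 0 (already present)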
@@ -193,3 +193,32 @@ class JavHDActorItem(scrapy.Item):
     ethnicity = scrapy.Field()
     birth_place = scrapy.Field()
     is_full_data = scrapy.Field()
+
+
+class LordActorItem(scrapy.Item):
+    item_type = scrapy.Field()
+    pornstar = scrapy.Field()
+    rating = scrapy.Field()
+    rank = scrapy.Field()
+    votes = scrapy.Field()
+    href = scrapy.Field()
+    career_start = scrapy.Field()
+    measurements = scrapy.Field()
+    born = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    date_modified = scrapy.Field()
+    global_rank = scrapy.Field()
+    weekly_rank = scrapy.Field()
+    last_month_rating = scrapy.Field()
+    current_rating = scrapy.Field()
+    total_votes = scrapy.Field()
+    birth_date = scrapy.Field()
+    birth_year = scrapy.Field()
+    birth_place = scrapy.Field()
+    height_ft = scrapy.Field()
+    height_cm = scrapy.Field()
+    weight_lbs = scrapy.Field()
+    weight_kg = scrapy.Field()
+    is_full_data = scrapy.Field()
+    alias = scrapy.Field()
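A small sketch of how such an Item is typically filled from a parsed dict, mirroring the "if k in item.fields" guard used later in lord_spider.py; the trimmed-down item class and sample values below are made up:

import scrapy

class DemoActorItem(scrapy.Item):
    pornstar = scrapy.Field()
    rating = scrapy.Field()
    href = scrapy.Field()

parsed = {"pornstar": "Example Name", "rating": 88, "ignored_key": "dropped"}
item = DemoActorItem()
for k, v in parsed.items():
    if k in item.fields:        # unknown keys are skipped instead of raising KeyError
        item[k] = v
print(dict(item))               # {'pornstar': 'Example Name', 'rating': 88}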
@@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
             yield request

     def parse(self, response):
-        """Unified response entry point"""
+        """Unified response entry point. In practice it is never used, because requests go straight to the callback given on scrapy.Request."""
         # record the request duration
         request_time = response.meta.get('request_time')
         if request_time:
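To illustrate the point made in the updated docstring: once a request carries an explicit callback, Scrapy routes the response to that callback and parse() is never invoked. A minimal, hypothetical spider:

import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # only reached for requests created without an explicit callback
        self.logger.info("parse() called for %s", response.url)

    def start_requests(self):
        for url in self.start_urls:
            # this response goes directly to handle_page(); parse() is skipped
            yield scrapy.Request(url, callback=self.handle_page)

    def handle_page(self, response):
        self.logger.info("handle_page() called for %s", response.url)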
@@ -1,15 +1,19 @@
 import scrapy
 import re
+import sys
+from urllib.parse import urljoin, quote_plus
 from scrapy_proj.spiders.base_spider import BaseSpider
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
 from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
 from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
+from scrapy_proj.spiders.parser.iafd_parser import common_parser
+from scrapy_proj.utils.utils import pretty_json_simple

 db_tools = IAFDDBHandler()

 class IAFDSpider(BaseSpider):
     name = SPIDER_NAME_IAFD
-    allowed_domains = ["iafd.com"]
+    allowed_domains = ["iafd.com", "www.iafd.com"]

     host_url = "https://www.iafd.com"
     astr_base_url = f"{host_url}/astrology.rme/sign="
@@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
     studios_list_url = f"{host_url}/studio.asp"
     ethnic_list_url = f'{host_url}/advsearch.asp'

-    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
+    def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.update = int(update)
+        self.update_mode = True if mod and mod.lower() == 'update' else False
         self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

         self.cmd_astro = 'astro'
@@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
         query_args = {}
         if self.debug:
             query_args['limit'] = 5
-        if self.update == 0:
+        if self.update_mode:
             query_args['is_full_data'] = 0
+            query_args['is_full_data'] = 404

         # read the list of performers waiting to be updated
         if self.cmd_performers in self.cmd_list:
@@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
                 href = item.get('href', '')
                 movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
                 self.logger.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type': 'actor'})

         # read the list of movies waiting to be updated
         if self.cmd_movies in self.cmd_list:
@@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
             for item in movies:
                 href = item.get('href', '')
                 self.logger.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type': 'movie'})


     def start_astro(self):
@@ -113,49 +118,27 @@ class IAFDSpider(BaseSpider):
             yield request

     def parse_astro_page(self, response):
-        astro = response.meta['astro']
-        astro_div = response.css('div#astro')
-        if astro_div:
-            birth_date = None
-            for elem in astro_div.css('*'):
-                if elem.css('h3.astroday'):
-                    birth_date = elem.css('h3.astroday::text').get().strip()
-                elif elem.css('div.perficon'):
-                    a_tag = elem.css('a')
-                    if a_tag:
-                        href = self.host_url + a_tag.attrib['href']
-                        name = a_tag.css('span.perfname::text').get()
-                        if name:
-                            item = IAFDPersonItem()
-                            item['name'] = name
-                            item['href'] = href
-                            item['from_astro_list'] = 1
-                            item['from_birth_list'] = 0
-                            item['from_ethnic_list'] = 0
-                            item['from_movie_list'] = 0
-                            yield item
-                            #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+        astro = response.meta.get('astro', '')
+        data, next_url = common_parser(html=response.text, page='astro', astro=astro)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"parse data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_birth_page(self, response):
         month = response.meta['month']
         day = response.meta['day']
-        datarows = response.css('div.col-sm-12.col-lg-9')
-        if datarows:
-            rows = datarows[0].css('div.col-sm-4')
-            for row in rows:
-                link_tag = row.css('a')
-                person = link_tag.css('::text').get().strip() if link_tag else ''
-                href = self.host_url + link_tag.attrib['href'] if link_tag else ''
-
-                item = IAFDPersonItem()
-                item['name'] = person
-                item['href'] = href
-                item['from_astro_list'] = 0
-                item['from_birth_list'] = 1
-                item['from_ethnic_list'] = 0
-                item['from_movie_list'] = 0
-                yield item
-                #yield scrapy.Request(href, callback=self.parse_person_detail_page)
+        data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"parse data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_ethnic_list_page(self, response):
         div_root = response.css('select#ethnicity1')
@@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
             href = option.attrib.get('value')
             text = option.css('::text').get().strip()
             if href and href.lower() != 'none':
-                ethnic_url = self.host_url + href
+                ethnic_url = urljoin(response.url, href)
+                self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                 yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
-            if self.debug:
-                break

     def parse_ethnic_page(self, response):
         ethnic = response.meta['ethnic']
-        rows = response.css('div.row.headshotrow')
-        for row in rows:
-            cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
-            for col in cols:
-                link_tag = col.css('a')
-                img_tag = col.css('div.pictag')
-                if link_tag and img_tag:
-                    href = self.host_url + link_tag.attrib['href']
-                    person = img_tag.css('::text').get().strip()
-
-                    item = IAFDPersonItem()
-                    item['name'] = person
-                    item['href'] = href
-                    item['from_astro_list'] = 0
-                    item['from_birth_list'] = 0
-                    item['from_ethnic_list'] = 1
-                    item['from_movie_list'] = 0
-                    yield item
-                    #yield scrapy.Request(href, callback=self.parse_person_detail_page)
-
-        next_page = response.css('a[rel="next"]')
-        if next_page:
-            next_url = self.host_url + next_page.attrib['href']
-            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+        data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
         else:
-            self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
-            self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
+            self.logger.warning(f"parse data error. {response.url}")
+
+        if next_url:
+            self.logger.info(f"find next page: {next_url}")
+        else:
+            self.logger.info(f"found all pages. url: {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item

     def parse_distributors_list_page(self, response):
         select_element = response.css('select[name="Distrib"]')
@@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
         for option in options:
             value = option.attrib.get('value')
             text = option.css('::text').get().strip()
-            dis_url = self.host_url + f"/distrib.rme/distrib={value}"
-            item = IAFDMovieItem()
-            item['title'] = text
-            item['href'] = dis_url
-            item['release_year'] = 0
-            item['from_performer_list'] = 0
-            item['from_dist_list'] = 1
-            item['from_stu_list'] = 0
-            yield item
-            #yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
+            dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
+            yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})

     def parse_studios_list_page(self, response):
         select_element = response.css('select[name="Studio"]')
@@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
         for option in options:
             value = option.attrib.get('value')
             text = option.css('::text').get().strip()
-            stu_url = self.host_url + f"/studio.rme/studio={value}"
-            item = IAFDMovieItem()
-            item['title'] = text
-            item['href'] = stu_url
-            item['release_year'] = 0
-            item['from_performer_list'] = 0
-            item['from_dist_list'] = 0
-            item['from_stu_list'] = 1
-            yield item
-            #yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
+            dis_url = f"{self.host_url}/studio.rme/studio={value}"
+            yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})

+    def parse_stu_dist_page(self, response):
+        list_type = response.meta.get('list_type', '')
+        data, next_url = common_parser(html=response.text, page=list_type)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
+        item = IAFDPersonDetailItem()
+        #yield item
+
+
     def parse_person_detail_page(self, response):
+        data = common_parser(html=response.text, page='actor', url=response.url)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
         item = IAFDPersonDetailItem()
-        item['href'] = response.url
-        item['person'] = response.css('h1::text').get()  # assume the name is in the h1 tag
-        # parse the remaining details; adjust to the real page structure
-        item['gender'] = response.css('span.gender::text').get()
-        item['birthday'] = response.css('span.birthday::text').get()
-        item['astrology'] = response.css('span.astrology::text').get()
-        item['birthplace'] = response.css('span.birthplace::text').get()
-        item['years_active'] = response.css('span.years_active::text').get()
-        item['ethnicity'] = response.css('span.ethnicity::text').get()
-        item['nationality'] = response.css('span.nationality::text').get()
-        item['hair_colors'] = response.css('span.hair_colors::text').get()
-        item['eye_color'] = response.css('span.eye_color::text').get()
-        item['height'] = response.css('span.height::text').get()
-        item['weight'] = response.css('span.weight::text').get()
-        item['measurements'] = response.css('span.measurements::text').get()
-        item['tattoos'] = response.css('span.tattoos::text').get()
-        item['piercings'] = response.css('span.piercings::text').get()
-        item['movies_cnt'] = response.css('span.movies_cnt::text').get()
-        item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
-        item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
-        item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
-        item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
-        item['performer_aka'] = response.css('span.performer_aka::text').getall()
-        yield item
+        #yield item

     def parse_movie_detail_page(self, response):
+        title = response.meta.get('title', '')
+        data = common_parser(html=response.text, page='movies', href=response.url, title=title)
+        if data:
+            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+        else:
+            self.logger.warning(f"fetched data error. {response.url}")
+
         item = IAFDMovieDetailItem()
-        item['title'] = response.css('h1::text').get()  # assume the title is in the h1 tag
-        item['href'] = response.url
-        # parse the remaining details; adjust to the real page structure
-        yield item
+        #yield item
+
+    def custom_block_check(self, response):
+        item_type = response.meta.get('item_type', '')
+        if "invalid or outdated page" in response.text.lower():
+            self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
+            return "invalid or outdated page"
+        else:
+            self.logger.info(f"right content. url: {response.url}")
+
+        return None
+
+    # handle error pages, mainly 404 and 403
+    def handle_blocked(self, response, reason):
+        item_type = response.meta.get('item_type', '')
+        if response.status in [404, 403]:
+            self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
@@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
             item['rank'] = rank
             item['url'] = url
             item[f'{lang}_name'] = name
-            #TODO: for non-English pages, go and update the matching name
+            # for non-English pages, go and update the matching name
            if lang != 'en':
                 item['url'] = replace_lang_param(item['url'])
             yield item
@@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
                     meta={"list_item": item}  # pass list-page data to the detail page
                 )
             else:
-                self.logger.info(f"actor(name) has full data. skip. url: {url}")
+                self.logger.info(f"actor({name}) has full data. skip. url: {url}")

         # fetch the next page
         next_path = data.get("pagination_params", {}).get("next")
scrapy_proj/scrapy_proj/spiders/lord_spider.py (new file, 399 lines)
@@ -0,0 +1,399 @@
import scrapy
import sys
import re
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import LordActorItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler

db_tools = LordDBHandler()

class LordSpider(BaseSpider):
    name = SPIDER_NAME_LORD
    allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]

    # request headers (reused from the original curl command)
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
            "priority": "u=0, i",
            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"macOS\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
        },
        "COOKIES_ENABLED": True  # enable cookie support
    }

    def __init__(self, debug='false', mod='update', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
        self.update_mod = False if mod and mod.lower() == 'force' else True

        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

    # entry point, triggered by the base class
    def custom_start_requests(self):
        url = 'https://thelordofporn.com/pornstars/'
        yield scrapy.Request(
            url=url,
            headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
            callback=self.parse_list,
            meta={}  # pass list-page data to the detail page
        )

    def parse_list(self, response):
        # extract every actor entry (article.loop-item in the original code)
        articles = response.css("article.loop-item")
        self.logger.info(f"found {len(articles)} actor entries on page {response.url}")

        for article in articles:
            try:
                # actor name and detail-page link
                title_tag = article.css("h3.loop-item__title a")
                title = title_tag.css("::text").get(default="N/A").strip()
                href = title_tag.attrib.get("href")  # href attribute of the <a> tag

                # rating
                rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()

                # rank and vote count (meta_tags in the original code)
                meta_tags = article.css("div.loop-item__rank span")
                rank = None
                votes = None

                # rank (the <b> tag inside the first span)
                if len(meta_tags) >= 1:
                    rank_b = meta_tags[0].css("b::text").get()
                    rank = rank_b.strip() if rank_b else "N/A"

                # vote count (the <b> tag inside the second span)
                if len(meta_tags) >= 2:
                    votes_b = meta_tags[1].css("b::text").get()
                    votes = votes_b.strip() if votes_b else "N/A"

                # convert to numeric values (mirrors utils.parse_numeric in the original code)
                def parse_numeric(value):
                    if not value or value == "N/A":
                        return None
                    # strip non-digit characters (commas, %, etc.)
                    numeric_str = ''.join(filter(str.isdigit, value))
                    return int(numeric_str) if numeric_str else None

                # build the actor data dict
                actress_data = {
                    "pornstar": title,
                    "rating": parse_numeric(rating),
                    "rank": parse_numeric(rank),
                    "votes": parse_numeric(votes),
                    "href": href if href else None
                }
                # request the detail page
                actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
                if actor_exists < 1:
                    yield scrapy.Request(
                        url=href,
                        callback=self.parse_actor_detail,
                        headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                        meta={'actor': actress_data}
                    )
                else:
                    self.logger.info(f"actor({title}) has full data. skip. url: {href}")

            except Exception as e:
                self.logger.error(f"failed to parse actor entry: {e}, page: {response.url}")
                continue  # skip the broken entry and keep going

        # next-page link (.next.page-numbers in the original code)
        next_page_url = None
        next_page_tag = response.css(".nav-links .next.page-numbers")
        if next_page_tag:
            next_page_href = next_page_tag.attrib.get("href")
            if next_page_href and not self.debug:
                # build the absolute URL (handles relative paths)
                next_page_url = urljoin(response.url, next_page_href)
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse_list,
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                    meta={}
                )
        else:
            self.logger.info(f"all pages parsed, current url: {response.url}")

    def parse_actor_detail(self, response):
        # 1. field mapping table: raw page field -> Item field
        FIELD_MAPPING = {
            # basic info
            'date_modified': 'date_modified',
            # rank info
            'Global Rank': 'global_rank',
            'Weekly Rank': 'weekly_rank',
            # rating info
            'Last Month': 'last_month_rating',
            'Rating Av.': 'current_rating',
            'Total of Votes': 'total_votes',
            # detailed attributes
            'Career start': 'career_start',
            'Measurements': 'measurements',
            'Born': 'born',
            'Height': 'height',
            'Weight': 'weight',
            'Name': 'alias_raw',  # aliases come from the Name field
            # parsed fields (birth / height / weight)
            'birth_date': 'birth_date',
            'birth_year': 'birth_year',
            'birth_place': 'birth_place',
            'height_ft': 'height_ft',
            'height_cm': 'height_cm',
            'weight_lbs': 'weight_lbs',
            'weight_kg': 'weight_kg',
            'alias': 'alias'
        }

        # 2. raw data container
        raw_data = {}
        # 3. basic info
        raw_data['href'] = response.url
        entry_header = response.css("header.entry-header")
        raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
        raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()

        # 4. rank info
        for item in entry_header.css("div.porn-star-rank__item"):
            item_text = item.css("::text").get(default="").strip()
            raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))

        # 5. rating and vote info
        for item in response.css("div.specifications__item--horizontal"):
            # 1. locate the title area precisely (excluding the <b> tag)
            # case 1: structure with a child div (e.g. Rating Av. with an img)
            title_div = item.css("div:first-child")
            if title_div:
                # only take text inside the child div (the sibling <b> tag is excluded automatically)
                title_parts = title_div.css("::text").getall()
            else:
                # cases 2 and 3: no child div (Last Month and Total of Votes)
                # take all text inside the item, then drop the <b> tag text
                all_text_parts = item.css("::text").getall()
                b_text_parts = item.css("b::text").getall()
                # remove the <b> text from the full text
                title_parts = [t for t in all_text_parts if t not in b_text_parts]

            # 2. clean the title text (non-breaking spaces and whitespace)
            title_text = "".join(title_parts)
            title_text = title_text.replace(u'\xa0', u' ')  # replace non-breaking spaces
            title_text = re.sub(r'\s+', ' ', title_text).strip()  # collapse whitespace

            raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))

        # 6. detailed attributes (specifications-grid-row)
        for row in response.css("div.specifications-grid-row"):
            items = row.css("div.specifications-grid-item")
            for i in [0, 1]:  # two attributes per row
                if i < len(items):
                    label = extract_text_from_element(items[i].css("h5"))
                    value = extract_text_from_element(items[i].css("span"))
                    if label:
                        raw_data[label] = value

        # 7. special fields (aliases need cleaning)
        raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))

        # 9. parse and merge birth info, height and weight
        raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
        raw_data.update(self.parse_height(raw_data.get("Height", "")))
        raw_data.update(self.parse_weight(raw_data.get("Weight", "")))

        # 10. map into the Item and return it
        item = LordActorItem()
        item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
        actor_data = response.meta['actor']
        for k, v in actor_data.items():
            if k in item.fields:
                item[k] = v

        for raw_field, item_field in FIELD_MAPPING.items():
            if item_field in item.fields:
                item[item_field] = raw_data.get(raw_field, "")

        # mark the record as complete
        item['is_full_data'] = 1
        self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")

        yield item

    # original helper functions, kept as methods of the spider class
    def parse_birth_info(self, text):
        match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
        if match:
            return {
                "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
                "birth_year": match.group(3),
                "birth_place": match.group(4),
            }
        return {"birth_date": text, "birth_year": "", "birth_place": ""}
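A quick standalone check of the birth-info pattern above; the sample strings are invented:

import re

pattern = r"(.+?) (\d{1,2}), (\d{4}) in (.+)"
for text in ["April 3, 1990 in Prague, Czech Republic", "1990"]:
    m = re.match(pattern, text, re.IGNORECASE)
    if m:
        print(f"{m.group(1)} {m.group(2)}, {m.group(3)}", "|", m.group(4))
    else:
        print("no match, raw value kept:", text)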

    def parse_height2(self, text):
        match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
        if match:
            height_ft = f"{match.group(1)}'{match.group(2)}\""
            return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
        return {"height_ft": text, "height_cm": ""}

    def parse_height(self, text):
        # normalize: commas to dots, fix the common ' n ' typo for ' in '
        text = text.replace(',', '.').replace(' n ', ' in ').strip()

        # regex matching the common feet-plus-inches formats
        # groups:
        # 1. feet value 2. feet unit (feet/ft/ft./') 3. inch value 4. inch unit (inches/in/in./inch/")
        # 5. cm/m value 6. unit (cm/m)
        pattern = r"""
            # case 1: feet and inches first, then cm/m (the common format)
            (?:(\d+)\s*(feet|ft\.?|')\s*)              # feet part (e.g. 5ft / 5')
            (?:and\s*)?                                # optional "and" (e.g. 5 feet and 2 inches)
            (\d+)\s*(inches|in\.?|inch|")?\s*          # inch part (e.g. 2in / 2")
            (?:\(?(\d+\.?\d*)\s*(cm|m)\)?)             # cm/m part (e.g. (157cm) / (1.57m))

            |  # or

            # case 2: cm first, feet and inches after (e.g. 170 cm / 5 feet and 7 inches)
            (\d+)\s*cm\s*/\s*                          # cm first
            (?:(\d+)\s*(feet|ft\.?|')\s*)              # feet part
            (?:and\s*)?
            (\d+)\s*(inches|in\.?|inch|")?             # inch part

            |  # or

            # case 3: pure shorthand (e.g. 5'3" (160 cm))
            (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)?            # 5'3" format
        """

        # VERBOSE ignores whitespace inside the pattern, IGNORECASE ignores case
        match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
        if not match:
            # handle a bare centimetre value (e.g. "160cm")
            cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
            if cm_match:
                return {"height_ft": "", "height_cm": cm_match.group(1)}
            return {"height_ft": text, "height_cm": ""}

        # pull out the matched groups, depending on which alternative hit
        ft = None
        inch = None
        cm = None
        num = None
        unit = None

        # case 1: feet and inches first, then cm/m
        if match.group(1) and match.group(3):
            ft = match.group(1)
            inch = match.group(3)
            num = match.group(5)
            unit = match.group(6).lower() if match.group(6) else 'cm'

        # case 2: cm first, then feet and inches
        elif match.group(7):
            cm = match.group(7)
            ft = match.group(8)
            inch = match.group(10)
            unit = 'cm'  # the leading unit in case 2 is always cm

        # case 3: pure shorthand (5'3"); these are groups 12-14 of the pattern
        elif match.group(12) and match.group(13):
            ft = match.group(12)
            inch = match.group(13)
            cm = match.group(14)
            unit = 'cm'

        # convert metres to centimetres when needed
        if not cm and num and unit:
            if unit == 'm':
                cm = str(int(float(num) * 100))  # 1.57m -> 157cm
            else:
                cm = num  # already centimetres

        # format the feet/inches expression (e.g. 5'2")
        height_ft = f"{ft}'{inch}\"" if ft and inch else ""

        return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}
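A standalone exercise of the verbose height pattern with a few invented inputs, showing which capture groups fire for each alternative:

import re

pattern = r"""
    (?:(\d+)\s*(feet|ft\.?|')\s*)(?:and\s*)?(\d+)\s*(inches|in\.?|inch|")?\s*(?:\(?(\d+\.?\d*)\s*(cm|m)\)?)
    |
    (\d+)\s*cm\s*/\s*(?:(\d+)\s*(feet|ft\.?|')\s*)(?:and\s*)?(\d+)\s*(inches|in\.?|inch|")?
    |
    (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)?
"""
for text in ["5 feet and 2 inches (157 cm)", "170 cm / 5 feet and 7 inches", "5'3\" (160 cm)"]:
    m = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
    print(text, "->", m.groups() if m else None)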


    def parse_weight2(self, text):
        match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
        if match:
            return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
        return {"weight_lbs": text, "weight_kg": ""}

    def parse_weight(self, text):
        # normalize whitespace and common formatting issues (non-breaking spaces)
        text = text.strip().replace(u'\xa0', u' ')

        # regex matching several weight formats
        # groups:
        # 1. pounds value 2. pounds unit (lb/lbs/pounds) 3. kg value 4. kg unit
        # 5. kg-first value 6. kg unit 7. trailing pounds value 8. pounds unit
        pattern = r"""
            # case 1: pounds first, kg after (the common format)
            (?:(\d+)\s*(lb|lbs|pounds)?\s*)     # pounds part (lb/lbs/pounds, unit optional)
            (?:\(?\s*(\d+)\s*(kg)\s*\)?)        # kg part (e.g. (45 kg))

            |  # or

            # case 2: kg first, pounds after (e.g. 52 kg / 114 lbs)
            (?:(\d+)\s*(kg)\s*/\s*)             # kg part
            (\d+)\s*(lb|lbs|pounds)?            # pounds part
        """

        # VERBOSE and IGNORECASE for robustness
        match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
        if not match:
            # bare kilograms (e.g. "52kg")
            kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
            if kg_match:
                return {"weight_lbs": "", "weight_kg": kg_match.group(1)}

            # bare pounds (e.g. "114lb")
            lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
            if lb_match:
                return {"weight_lbs": lb_match.group(1), "weight_kg": ""}

            # nothing parsable
            return {"weight_lbs": text, "weight_kg": ""}

        # pull out the matched groups, depending on which alternative hit
        weight_lbs = None
        weight_kg = None

        # case 1: pounds first, kg after
        if match.group(1) and match.group(3):
            weight_lbs = match.group(1)
            weight_kg = match.group(3)

        # case 2: kg first, pounds after
        elif match.group(5) and match.group(6):
            weight_kg = match.group(5)
            weight_lbs = match.group(7)

        return {
            "weight_lbs": weight_lbs.strip() if weight_lbs else "",
            "weight_kg": weight_kg.strip() if weight_kg else ""
        }

    def clean_alias(self, alias):
        alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
        return [name.strip() for name in alias.split(',') if name.strip()]

    def parse_numeric(self, value):
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0
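One note on clean_alias above: re.sub's fourth positional argument is count, so case-insensitive matching has to be passed as flags=. A short standalone check with an invented alias string:

import re

alias = "Jane Doe (age 30), J. Doe"
# passing re.IGNORECASE positionally would be interpreted as count, so use flags=
cleaned = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
print([n.strip() for n in cleaned.split(',') if n.strip()])   # ['Jane Doe', 'J. Doe']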
scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py (new file, 636 lines)
@@ -0,0 +1,636 @@

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils

# base URLs and variable parameters
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

ethnic_list_url = f'{host_url}/advsearch.asp'

# headers and scraper setup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99

save_raw_html = True
load_from_local = False

def common_parser(html, page, **kwargs):
    parser = "lxml" if page == 'ethnic' else "html.parser"
    soup = BeautifulSoup(html, parser)
    if not soup:
        return None
    if page == 'astro':
        #parse_page_astro(soup, astro):
        return parse_page_astro(soup, **kwargs)
    elif page == 'birth':
        #parse_page_birth(soup, month, day):
        return parse_page_birth(soup, **kwargs)
    elif page == 'ethnic':
        #parse_page_ethnic(soup, ethnic):
        return parse_page_ethnic(soup, **kwargs)
    elif page == 'dist':
        return parse_page_dist_stu(soup, 'distable')
    elif page == 'stu':
        return parse_page_dist_stu(soup, 'studio')
    elif page == 'actor':
        #parse_page_performer(soup, url):
        return parse_page_performer(soup, **kwargs)
    elif page == 'movies':
        #parse_page_movie(soup, href, title)
        return parse_page_movie(soup, **kwargs)
    else:
        logging.warning(f"wrong page: {page}")
        return None
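A sketch of how the spiders call this dispatcher (assuming scrapy_proj and its dependencies are importable; the HTML and URL below are placeholders). The list-style pages ('astro', 'birth', 'ethnic', 'dist', 'stu') return a (data, next_url) pair, while 'actor' and 'movies' return a single dict:

from scrapy_proj.spiders.parser.iafd_parser import common_parser

html = "<html><body><div id='astro'></div></body></html>"   # placeholder markup
persons, next_url = common_parser(html=html, page='astro', astro='Aries')                    # list-style page
actor_detail = common_parser(html=html, page='actor', url='https://www.iafd.com/placeholder')  # detail page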
|
|
||||||
|
'''
|
||||||
|
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||||
|
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||||
|
if load_from_local: # 从本地读取的逻辑
|
||||||
|
html = utils.read_raw_html(url)
|
||||||
|
if html:
|
||||||
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
html_text = preprocessor(html) if preprocessor else html
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_text, parser)
|
||||||
|
if validator(soup): # 进行自定义页面检查
|
||||||
|
return soup, http_code_local # 返回一个小于100的错误码,表明是从本地返回的
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
if host_url not in url.lower():
|
||||||
|
logging.error(f'wrong url format: {url}')
|
||||||
|
return None, http_code_url
|
||||||
|
|
||||||
|
response = scraper.get(url, headers=headers)
|
||||||
|
|
||||||
|
# 处理 HTTP 状态码
|
||||||
|
if response.status_code == 404:
|
||||||
|
logging.debug(f"Page not found (404): {url}")
|
||||||
|
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
|
response.raise_for_status() # 处理 HTTP 错误
|
||||||
|
|
||||||
|
# 过期的网页,与404相同处理
|
||||||
|
if "invalid or outdated page" in response.text.lower():
|
||||||
|
logging.debug(f"invalid or outdated page: {url}")
|
||||||
|
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
|
if save_raw_html:
|
||||||
|
utils.write_raw_html(url, response.text)
|
||||||
|
|
||||||
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_text, parser)
|
||||||
|
if validator(soup): # 进行自定义页面检查
|
||||||
|
return soup, response.status_code
|
||||||
|
else:
|
||||||
|
# 检查是否发生跳转,比如到登录页面
|
||||||
|
if response.history:
|
||||||
|
logging.warning(f"Page redirected on {url}. Validation failed.")
|
||||||
|
return None, http_code_login
|
||||||
|
|
||||||
|
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||||
|
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||||
|
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
|
||||||
|
except cloudscraper.exceptions.CloudflareCode1020 as e:
|
||||||
|
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
||||||
|
|
||||||
|
logging.error(f'Fetching failed after max retries. {url}')
|
||||||
|
return None, None # 达到最大重试次数仍然失败
|
||||||
|
'''
|
||||||
|
|
||||||
|
# 修复 HTML 结构,去除多余标签并修正 <a> 标签,在获取人种的时候需要
|
||||||
|
def preprocess_html(html):
|
||||||
|
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
|
||||||
|
|
||||||
|
# 通用的 HTML 结构验证器
|
||||||
|
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||||
|
if attr_type == "id":
|
||||||
|
return soup.find(tag, id=identifier) is not None
|
||||||
|
elif attr_type == "class":
|
||||||
|
return bool(soup.find_all(tag, class_=identifier))
|
||||||
|
elif attr_type == "name":
|
||||||
|
return bool(soup.find('select', {'name': identifier}))
|
||||||
|
return False
|
||||||
|
|
||||||
|
# 检查电影信息是否存在
|
||||||
|
def movie_validator(soup, table_id):
|
||||||
|
return soup.find("table", id=table_id) is not None
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_ethnic_list(soup, href):
|
||||||
|
div_root = soup.find("select", id="ethnicity1")
|
||||||
|
if not div_root:
|
||||||
|
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
list_data = []
|
||||||
|
|
||||||
|
# 提取所有的 <option> 标签
|
||||||
|
options = div_root.find_all('option')
|
||||||
|
if options:
|
||||||
|
# 解析并输出 value 和文本内容
|
||||||
|
for option in options:
|
||||||
|
href = option.get('value', None)
|
||||||
|
text = option.text.strip()
|
||||||
|
if href and href.lower() == 'none':
|
||||||
|
continue
|
||||||
|
list_data.append({
|
||||||
|
"name": text,
|
||||||
|
"href": host_url + href if href else ''
|
||||||
|
})
|
||||||
|
return list_data
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_astro(soup, astro):
|
||||||
|
astro_div = soup.find("div", id="astro")
|
||||||
|
if not astro_div:
|
||||||
|
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
flag = False
|
||||||
|
list_cnt = 0
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
birth_date = None
|
||||||
|
for elem in astro_div.find_all(recursive=False):
|
||||||
|
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||||
|
birth_date = elem.get_text(strip=True)
|
||||||
|
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||||
|
a_tag = elem.find("a")
|
||||||
|
if a_tag:
|
||||||
|
href = host_url + a_tag["href"]
|
||||||
|
name = a_tag.find("span", class_="perfname")
|
||||||
|
if name:
|
||||||
|
list_data.append({
|
||||||
|
"astrology": astro,
|
||||||
|
"birth_date": birth_date,
|
||||||
|
"person": name.get_text(strip=True),
|
||||||
|
"href": href
|
||||||
|
})
|
||||||
|
flag = True
|
||||||
|
list_cnt = list_cnt +1
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# 解析页面内容并更新birth_map
|
||||||
|
def parse_page_birth(soup, month, day):
|
||||||
|
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||||
|
if not datarows:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
flag = False
|
||||||
|
list_cnt = 0
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||||
|
for row in rows:
|
||||||
|
link_tag = row.find('a')
|
||||||
|
person = link_tag.text.strip() if link_tag else ''
|
||||||
|
href = link_tag['href'] if link_tag else ''
|
||||||
|
href = host_url + href
|
||||||
|
|
||||||
|
# 如果 href 已经在 birth_map 中,跳过
|
||||||
|
flag = True
|
||||||
|
if any(entry['href'] == href for entry in list_data):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 将数据添加到 birth_map
|
||||||
|
list_data.append({
|
||||||
|
'month': month,
|
||||||
|
'day': day,
|
||||||
|
'person': person,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
list_cnt = list_cnt +1
|
||||||
|
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_ethnic(soup, ethnic):
|
||||||
|
rows = soup.find_all('div', class_='row headshotrow')
|
||||||
|
flag = False
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||||
|
link_tag = col.find('a')
|
||||||
|
img_tag = col.find('div', class_='pictag')
|
||||||
|
flag = True
|
||||||
|
|
||||||
|
if link_tag and img_tag:
|
||||||
|
href = host_url + link_tag['href']
|
||||||
|
person = img_tag.text.strip()
|
||||||
|
|
||||||
|
# 将数据存储到 ethnic_map
|
||||||
|
list_data.append({
|
||||||
|
'ethnic': ethnic,
|
||||||
|
'person': person,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
if flag:
|
||||||
|
logging.debug(f"get {len(list_data)} persons from this page.")
|
||||||
|
|
||||||
|
next_page = soup.find('a', rel='next')
|
||||||
|
if next_page:
|
||||||
|
next_url = host_url + next_page['href']
|
||||||
|
logging.debug(f"Found next page: {next_url}")
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
logging.debug(f"All pages fetched for {ethnic}.")
|
||||||
|
return list_data, None
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 解析列表页
|
||||||
|
def parse_page_dist_stu_list(soup, select_name):
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
|
||||||
|
select_element = soup.find('select', {'name': select_name})
|
||||||
|
if select_element :
|
||||||
|
options = select_element.find_all('option')
|
||||||
|
for option in options:
|
||||||
|
value = option.get('value') # 获取 value 属性
|
||||||
|
text = option.text.strip() # 获取文本内容
|
||||||
|
list_data.append({
|
||||||
|
'name' : text,
|
||||||
|
'href' : str(value)
|
||||||
|
})
|
||||||
|
return list_data, next_url
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_page_dist_stu(soup, table_id):
|
||||||
|
table = soup.find("table", id=table_id)
|
||||||
|
if not table:
|
||||||
|
logging.warning(f"Warning: No {table_id} table found ")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# 找到thead并跳过
|
||||||
|
thead = table.find('thead')
|
||||||
|
if thead:
|
||||||
|
thead.decompose() # 去掉thead部分,不需要解析
|
||||||
|
|
||||||
|
# 现在只剩下tbody部分
|
||||||
|
tbody = table.find('tbody')
|
||||||
|
rows = tbody.find_all('tr') if tbody else []
|
||||||
|
|
||||||
|
list_data = []
|
||||||
|
next_url = None
|
||||||
|
for row in rows:
|
||||||
|
cols = row.find_all('td')
|
||||||
|
if len(cols) >= 5:
|
||||||
|
title = cols[0].text.strip()
|
||||||
|
label = cols[1].text.strip()
|
||||||
|
year = cols[2].text.strip()
|
||||||
|
rev = cols[3].text.strip()
|
||||||
|
a_href = cols[0].find('a')
|
||||||
|
href = host_url + a_href['href'] if a_href else ''
|
||||||
|
|
||||||
|
list_data.append({
|
||||||
|
'title': title,
|
||||||
|
'label': label,
|
||||||
|
'year': year,
|
||||||
|
'rev': rev,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
return list_data, next_url
|
||||||
|
|
||||||
|
|
||||||
|
# 解析 作品列表,有个人出演,也有导演的
|
||||||
|
def parse_credits_table(table, distributor_list):
|
||||||
|
# 找到thead并跳过
|
||||||
|
thead = table.find('thead')
|
||||||
|
if thead:
|
||||||
|
thead.decompose() # 去掉thead部分,不需要解析
|
||||||
|
|
||||||
|
# 现在只剩下tbody部分
|
||||||
|
tbody = table.find('tbody')
|
||||||
|
rows = tbody.find_all('tr') if tbody else []
|
||||||
|
|
||||||
|
movies = []
|
||||||
|
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||||
|
|
||||||
|
# rows = table.find_all('tr', class_='we')
|
||||||
|
for row in rows:
|
||||||
|
#tr_class = row.get('class', '') # 获取 class 属性,如果没有则返回空字符串
|
||||||
|
tr_class = ' '.join(row.get('class', [])) # 获取 class 属性,如果没有则返回空字符串
|
||||||
|
cols = row.find_all('td')
|
||||||
|
if len(cols) >= 6:
|
||||||
|
title = cols[0].text.strip()
|
||||||
|
href_a = cols[0].find('a')
|
||||||
|
href = href_a['href'] if href_a else ''
|
||||||
|
year = cols[1].text.strip()
|
||||||
|
distributor = cols[2].text.strip().lower()
|
||||||
|
href_d = cols[2].find('a')
|
||||||
|
href_dist = host_url + href_d['href'] if href_d else ''
|
||||||
|
notes = cols[3].text.strip()
|
||||||
|
rev = cols[4].text.strip()
|
||||||
|
formats = cols[5].text.strip()
|
||||||
|
|
||||||
|
for key in distributor_list:
|
||||||
|
if key in distributor:
|
||||||
|
distributor_count[key] += 1
|
||||||
|
|
||||||
|
movies.append({
|
||||||
|
'title': title,
|
||||||
|
'href' : href,
|
||||||
|
'year': year,
|
||||||
|
'distributor': distributor,
|
||||||
|
'distributor_href': href_dist,
|
||||||
|
'notes': notes,
|
||||||
|
'rev': rev,
|
||||||
|
'formats': formats,
|
||||||
|
'tr_class': tr_class
|
||||||
|
})
|
||||||
|
return movies, distributor_count
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the performer page and extract the data we need
def parse_page_performer(soup, url):
    data = {}

    # Field names we want, mapped to the labels used in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credit tables: one for personal appearances, one for directorial work
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Per-distributor counters, accumulated across both roles
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Merge this role's distributor statistics into the totals
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Total number of movies across all roles
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list)

    # Nothing was found at all
    if len(credits_list) == 0:
        logging.warning(f"movie table empty. url: {url}")

    # Walk every bioheading to collect the metadata fields
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling: the aliases sit in a div
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata_p = bio.find_next('p', class_='biodata')
            biodata = biodata_p.text.strip() if biodata_p else ''

        # Store the value under our own field name
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Attach the statistics to the result
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list

    return data

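# --- Hedged alternative sketch: merging per-role distributor stats with collections.Counter ---
# Equivalent in effect to the dict-based accumulation inside parse_page_performer above;
# shown only as an illustrative variant, not as the implementation the spider uses.
from collections import Counter

def _merge_distributor_counts(stat_maps, distributor_list):
    total = Counter({key: 0 for key in distributor_list})
    for stat_map in stat_maps:
        total.update(stat_map)  # Counter adds values key by key
    return dict(total)
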
# Parse the movie page HTML and extract the movie details
def parse_page_movie(soup, href, title):
    # Basic movie info
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            if key == "Directors":  # several directors listed on one line
                directors = []
                links = value.find_all("a")
                for link in links:
                    director_name = link.text.strip()
                    director_href = host_url + link.get('href') if link.get('href') else ''
                    directors.append({"name": director_name, "href": director_href})
                movie_data[key] = directors
            else:
                val = value.text.strip()
                if key in ["Distributor", "Studio", "Director"]:
                    link = value.find("a")
                    if link:
                        val = link.text.strip()
                        movie_data[f'{key}Href'] = host_url + link['href']
                movie_data[key] = val
    else:
        return None

    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        # Text nodes following each <br> are the performer's tags
        tags = []
        for br in cast.find_all("br"):
            tag = br.next_sibling
            if isinstance(tag, str) and tag.strip():
                tags.append(tag.strip())
        performer["tags"] = tags

        performers.append(performer)

    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()   # scene number
                performer_info = cols[1]       # performers plus their links

                # Keep the HTML up to the first <br> (preserving <i> tags and the like).
                # str.split() always returns at least one element, so check for the tag
                # explicitly instead of testing the split result.
                performer_html = str(performer_info)
                if "<br/>" in performer_html:
                    performers_html = performer_html.split("<br/>")[0].strip()
                elif "<br>" in performer_html:
                    performers_html = performer_html.split("<br>")[0].strip()
                else:
                    performers_html = performer_html.strip()

                # Reduce to plain text (strip the HTML tags, keep only the text content)
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performer names
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to pick up the webscene and studio links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip()
                    webscene = links[0]["href"]
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    # "Appears in" entries
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Directors": movie_data.get("Directors", []),  # only present when several directors are listed
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }

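# --- Hedged self-check for the <br> splitting above (illustrative only; never called) ---
# The cell markup is made up; it just confirms that everything before the first <br>
# is kept and reduced to a comma-separated list of names.
def _selfcheck_scene_split():
    cell = BeautifulSoup("<td>Anna, <i>Bella</i><br/><a href='/x'>Scene 1</a></td>", "html.parser").td
    html = str(cell)
    head = html.split("<br/>")[0] if "<br/>" in html else html
    names = [p.strip() for p in BeautifulSoup(head, "html.parser").get_text().split(",")]
    assert names == ["Anna", "Bella"]
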
if __name__ == "__main__":

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0])
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

        time.sleep(2)  # throttle the request rate

@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
    )
    return urlunparse(new_parsed)


def pretty_json_simple(item):
    try:
        # Serialize to single-line JSON; the caller must pass a mapping, not a list.
        # `json` is assumed to be imported at the top of this module.
        return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
    except Exception:
        # Fall back to returning the original object if conversion fails
        return item
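
# --- Hedged usage example for pretty_json_simple (illustrative only; not part of the module) ---
def _example_pretty_json_simple():
    assert pretty_json_simple({'name': 'Ann', 'cnt': 2}) == '{"name":"Ann","cnt":2}'
    # A list cannot be turned into a dict, so the input is returned unchanged
    assert pretty_json_simple([1, 2, 3]) == [1, 2, 3]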