modify scripts
This commit is contained in:
@@ -133,8 +133,9 @@ fi

# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
    register_spider "pbox" "scrapy crawl javhd -a mod='update' "
    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
    register_spider "javhd" "scrapy crawl javhd -a mod='update' "
    register_spider "lord" "scrapy crawl lord -a mod='update' "
fi

@@ -6,6 +6,7 @@ from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
from scrapy_proj.utils.utils import pretty_json_simple

# Registry of spider DB handlers
spider_handler_registry = {}
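The dictionary is populated by a register_handler decorator, which this hunk does not show (it is applied further down as @register_handler(comm.SPIDER_NAME_LORD)). A minimal sketch of how such a decorator is typically written, assuming this structure rather than quoting the project's actual code:

# Sketch only -- the real register_handler is defined elsewhere in the project.
def register_handler(spider_name):
    def wrapper(cls):
        spider_handler_registry[spider_name] = cls  # look up the handler class by spider name later
        return cls
    return wrapper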
@@ -609,3 +610,61 @@ class JavHDDBHandler(SQLiteDBHandler):
        except sqlite3.Error as e:
            logging.error(f"query error: {e}")
            return 0


@register_handler(comm.SPIDER_NAME_LORD)
class LordDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):
        super().__init__(db_path)
        self.tbl_name_actors = 'thelordofporn_actress'
        self.tbl_name_alias = 'thelordofporn_alias'

    def insert_item(self, item):
        if item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
            self.insert_actor(item)
        else:
            logging.error(f"unknown item.")

        return item

    def insert_actor(self, item):
        actor_id = self.insert_or_update_common(item, self.tbl_name_actors, uniq_key='href', exists_do_nothing=False)
        if actor_id:
            for alias in item.get('alias', []):
                alias_data = {'actress_id': actor_id, 'alias': alias}
                affected_rows = self.insert_or_update_with_composite_pk(data=alias_data, tbl_name=self.tbl_name_alias, composite_pk=['actress_id', 'alias'], exists_do_nothing=False)
                if affected_rows:
                    logging.debug(f"insert/update actress_alias. data: {alias_data}")
                else:
                    logging.warning(f"insert actor alias error! data: {alias_data}")
        else:
            logging.warning(f"insert actor data error! data: {pretty_json_simple(item)}")

    # Statistics helper
    def get_stat(self):
        try:
            self.cursor.execute(f"""
                SELECT
                    (SELECT COUNT(*) FROM {self.tbl_name_actors}) AS actor_cnt
            """)

            row = self.cursor.fetchone()
            if not row:
                logging.warning(f"query returned no results.")
                return {}

            columns = [desc[0] for desc in self.cursor.description]
            return dict(zip(columns, row))

        except sqlite3.Error as e:
            logging.error(f"query error: {e}")
            return {}

    def has_full_data(self, href):
        try:
            self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_actors} WHERE is_full_data=1 and href = ?", (href,))
            row = self.cursor.fetchone()
            return row[0] if row else None
        except sqlite3.Error as e:
            logging.error(f"query error: {e}")
            return 0

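For the alias writes above to behave as intended, the thelordofporn_alias table needs a composite primary key over (actress_id, alias). The actual DDL is not part of this commit; an assumed minimal schema, shown through sqlite3 for illustration:

import sqlite3

# Assumed schema (not taken from the repo): each (actress_id, alias) pair is unique,
# so one actress can carry many aliases without duplicate rows.
conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE thelordofporn_alias (
        actress_id INTEGER NOT NULL,
        alias      TEXT    NOT NULL,
        PRIMARY KEY (actress_id, alias)
    )
""")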
@@ -188,6 +188,68 @@ class SQLiteDBHandler(metaclass=SingletonMeta):  # singleton metaclass applied
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_with_composite_pk(self, data, tbl_name, composite_pk, exists_do_nothing=True):
        """
        Insert or update a row in a table that uses a composite primary key.

        :param data: dict, the data to insert or update
        :param tbl_name: table name
        :param composite_pk: list, the field names that make up the composite primary key
        :param exists_do_nothing: bool, skip the update when the record already exists (default True)
        :return: number of rows affected by the operation
        """
        try:
            # Validate the composite primary key argument
            if not isinstance(composite_pk, list) or len(composite_pk) < 2:
                logging.error(f"composite primary key must be a list of at least two fields: {composite_pk}")
                return None

            processed_data = self.check_and_process_data(data, tbl_name)

            # Make sure every composite key field is present in the data
            for pk_field in composite_pk:
                if pk_field not in processed_data:
                    logging.error(f"composite key field '{pk_field}' is missing from the data")
                    return None

            # Build the lookup condition
            where_conditions = " AND ".join([f"{pk} = ?" for pk in composite_pk])
            pk_values = [processed_data[pk] for pk in composite_pk]

            # Check whether the record already exists
            self.cursor.execute(
                f"SELECT 1 FROM {tbl_name} WHERE {where_conditions}",
                pk_values
            )
            exists = self.cursor.fetchone() is not None

            if exists:
                if exists_do_nothing:
                    return 0

                # Build the update field list (excluding the composite key fields)
                update_fields = [f for f in processed_data.keys() if f not in composite_pk]
                if not update_fields:
                    return 0

                set_clause = ", ".join([f"{field} = ?" for field in update_fields])
                update_values = [processed_data[field] for field in update_fields] + pk_values

                # Run the update (standard syntax, compatible with older SQLite versions)
                update_sql = f"UPDATE {tbl_name} SET {set_clause} WHERE {where_conditions}"
                self.cursor.execute(update_sql, update_values)
                return 1
            else:
                # Run the insert
                columns = ", ".join(processed_data.keys())
                placeholders = ", ".join(["?" for _ in processed_data.keys()])
                insert_sql = f"INSERT INTO {tbl_name} ({columns}) VALUES ({placeholders})"
                self.cursor.execute(insert_sql, list(processed_data.values()))
                return 2
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def get_id_by_key(self, tbl, uniq_key, val):
        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
        row = self.cursor.fetchone()

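A hedged usage sketch of the new method, mirroring the call made by LordDBHandler.insert_actor above (the data values are invented):

from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler

handler = LordDBHandler()
alias_data = {'actress_id': 42, 'alias': 'Example Name'}  # illustrative data only
rc = handler.insert_or_update_with_composite_pk(
    data=alias_data,
    tbl_name=handler.tbl_name_alias,
    composite_pk=['actress_id', 'alias'],
    exists_do_nothing=False,
)
# rc: 2 for an insert, 1 for an update, 0 for a no-op, None on error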
@@ -192,4 +192,33 @@ class JavHDActorItem(scrapy.Item):
    birth_date = scrapy.Field()
    ethnicity = scrapy.Field()
    birth_place = scrapy.Field()
    is_full_data = scrapy.Field()


class LordActorItem(scrapy.Item):
    item_type = scrapy.Field()
    pornstar = scrapy.Field()
    rating = scrapy.Field()
    rank = scrapy.Field()
    votes = scrapy.Field()
    href = scrapy.Field()
    career_start = scrapy.Field()
    measurements = scrapy.Field()
    born = scrapy.Field()
    height = scrapy.Field()
    weight = scrapy.Field()
    date_modified = scrapy.Field()
    global_rank = scrapy.Field()
    weekly_rank = scrapy.Field()
    last_month_rating = scrapy.Field()
    current_rating = scrapy.Field()
    total_votes = scrapy.Field()
    birth_date = scrapy.Field()
    birth_year = scrapy.Field()
    birth_place = scrapy.Field()
    height_ft = scrapy.Field()
    height_cm = scrapy.Field()
    weight_lbs = scrapy.Field()
    weight_kg = scrapy.Field()
    is_full_data = scrapy.Field()
    alias = scrapy.Field()

@@ -31,7 +31,7 @@ class BaseSpider(scrapy.Spider):
            yield request

    def parse(self, response):
        """Unified response-handling entry point"""
        """Unified response-handling entry point; in practice it is never hit, because requests go straight to the callback set on scrapy.Request"""
        # Record the request duration
        request_time = response.meta.get('request_time')
        if request_time:
@ -1,15 +1,19 @@
|
||||
import scrapy
|
||||
import re
|
||||
import sys
|
||||
from urllib.parse import urljoin, quote_plus
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider
|
||||
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
||||
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
|
||||
from scrapy_proj.spiders.parser.iafd_parser import common_parser
|
||||
from scrapy_proj.utils.utils import pretty_json_simple
|
||||
|
||||
db_tools = IAFDDBHandler()
|
||||
|
||||
class IAFDSpider(BaseSpider):
|
||||
name = SPIDER_NAME_IAFD
|
||||
allowed_domains = ["iafd.com"]
|
||||
allowed_domains = ["iafd.com", "www.iafd.com"]
|
||||
|
||||
host_url = "https://www.iafd.com"
|
||||
astr_base_url = f"{host_url}/astrology.rme/sign="
|
||||
@ -19,10 +23,10 @@ class IAFDSpider(BaseSpider):
|
||||
studios_list_url = f"{host_url}/studio.asp"
|
||||
ethnic_list_url = f'{host_url}/advsearch.asp'
|
||||
|
||||
def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
|
||||
def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
|
||||
self.update = int(update)
|
||||
self.update_mode = True if mod and mod.lower() == 'update' else False
|
||||
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
|
||||
|
||||
self.cmd_astro = 'astro'
|
||||
@ -64,8 +68,9 @@ class IAFDSpider(BaseSpider):
|
||||
query_args = {}
|
||||
if self.debug:
|
||||
query_args['limit'] = 5
|
||||
if self.update == 0:
|
||||
if self.update_mode:
|
||||
query_args['is_full_data'] = 0
|
||||
query_args['is_full_data'] = 404
|
||||
|
||||
# Read the list of performers waiting to be updated
|
||||
if self.cmd_performers in self.cmd_list:
|
||||
@ -77,7 +82,7 @@ class IAFDSpider(BaseSpider):
|
||||
href = item.get('href', '')
|
||||
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
|
||||
self.logger.info(f"fetch from db. item: {item}")
|
||||
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
|
||||
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type':'actor'})
|
||||
|
||||
# Read the list of movies waiting to be updated
|
||||
if self.cmd_movies in self.cmd_list:
|
||||
@ -88,7 +93,7 @@ class IAFDSpider(BaseSpider):
|
||||
for item in movies:
|
||||
href = item.get('href', '')
|
||||
self.logger.info(f"fetch from db. item: {item}")
|
||||
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
|
||||
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type':'movie'})
|
||||
|
||||
|
||||
def start_astro(self):
|
||||
@ -113,50 +118,28 @@ class IAFDSpider(BaseSpider):
|
||||
yield request
|
||||
|
||||
def parse_astro_page(self, response):
|
||||
astro = response.meta['astro']
|
||||
astro_div = response.css('div#astro')
|
||||
if astro_div:
|
||||
birth_date = None
|
||||
for elem in astro_div.css('*'):
|
||||
if elem.css('h3.astroday'):
|
||||
birth_date = elem.css('h3.astroday::text').get().strip()
|
||||
elif elem.css('div.perficon'):
|
||||
a_tag = elem.css('a')
|
||||
if a_tag:
|
||||
href = self.host_url + a_tag.attrib['href']
|
||||
name = a_tag.css('span.perfname::text').get()
|
||||
if name:
|
||||
item = IAFDPersonItem()
|
||||
item['name'] = name
|
||||
item['href'] = href
|
||||
item['from_astro_list'] = 1
|
||||
item['from_birth_list'] = 0
|
||||
item['from_ethnic_list'] = 0
|
||||
item['from_movie_list'] = 0
|
||||
yield item
|
||||
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
|
||||
astro = response.meta.get('astro', '')
|
||||
data, next_url = common_parser(html=response.text, page='astro', astro=astro)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
|
||||
item = IAFDPersonDetailItem()
|
||||
#yield item
|
||||
|
||||
def parse_birth_page(self, response):
|
||||
month = response.meta['month']
|
||||
day = response.meta['day']
|
||||
datarows = response.css('div.col-sm-12.col-lg-9')
|
||||
if datarows:
|
||||
rows = datarows[0].css('div.col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.css('a')
|
||||
person = link_tag.css('::text').get().strip() if link_tag else ''
|
||||
href = self.host_url + link_tag.attrib['href'] if link_tag else ''
|
||||
|
||||
item = IAFDPersonItem()
|
||||
item['name'] = person
|
||||
item['href'] = href
|
||||
item['from_astro_list'] = 0
|
||||
item['from_birth_list'] = 1
|
||||
item['from_ethnic_list'] = 0
|
||||
item['from_movie_list'] = 0
|
||||
yield item
|
||||
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
|
||||
data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
|
||||
item = IAFDPersonDetailItem()
|
||||
#yield item
|
||||
|
||||
def parse_ethnic_list_page(self, response):
|
||||
div_root = response.css('select#ethnicity1')
|
||||
if div_root:
|
||||
@ -167,40 +150,25 @@ class IAFDSpider(BaseSpider):
|
||||
href = option.attrib.get('value')
|
||||
text = option.css('::text').get().strip()
|
||||
if href and href.lower() != 'none':
|
||||
ethnic_url = self.host_url + href
|
||||
ethnic_url = urljoin(response.url , href)
|
||||
self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
|
||||
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
|
||||
if self.debug:
|
||||
break
|
||||
|
||||
def parse_ethnic_page(self, response):
|
||||
ethnic = response.meta['ethnic']
|
||||
rows = response.css('div.row.headshotrow')
|
||||
for row in rows:
|
||||
cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
|
||||
for col in cols:
|
||||
link_tag = col.css('a')
|
||||
img_tag = col.css('div.pictag')
|
||||
if link_tag and img_tag:
|
||||
href = self.host_url + link_tag.attrib['href']
|
||||
person = img_tag.css('::text').get().strip()
|
||||
|
||||
item = IAFDPersonItem()
|
||||
item['name'] = person
|
||||
item['href'] = href
|
||||
item['from_astro_list'] = 0
|
||||
item['from_birth_list'] = 0
|
||||
item['from_ethnic_list'] = 1
|
||||
item['from_movie_list'] = 0
|
||||
yield item
|
||||
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
|
||||
|
||||
next_page = response.css('a[rel="next"]')
|
||||
if next_page:
|
||||
next_url = self.host_url + next_page.attrib['href']
|
||||
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
|
||||
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
|
||||
self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
|
||||
if next_url:
|
||||
self.logger.info(f"find next page: {next_url}")
|
||||
else:
|
||||
self.logger.info(f"found all pages. url: {response.url}")
|
||||
|
||||
item = IAFDPersonDetailItem()
|
||||
#yield item
|
||||
|
||||
def parse_distributors_list_page(self, response):
|
||||
select_element = response.css('select[name="Distrib"]')
|
||||
@ -209,16 +177,8 @@ class IAFDSpider(BaseSpider):
|
||||
for option in options:
|
||||
value = option.attrib.get('value')
|
||||
text = option.css('::text').get().strip()
|
||||
dis_url = self.host_url + f"/distrib.rme/distrib={value}"
|
||||
item = IAFDMovieItem()
|
||||
item['title'] = text
|
||||
item['href'] = dis_url
|
||||
item['release_year'] = 0
|
||||
item['from_performer_list'] = 0
|
||||
item['from_dist_list'] = 1
|
||||
item['from_stu_list'] = 0
|
||||
yield item
|
||||
#yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
|
||||
dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
|
||||
|
||||
def parse_studios_list_page(self, response):
|
||||
select_element = response.css('select[name="Studio"]')
|
||||
@ -227,47 +187,54 @@ class IAFDSpider(BaseSpider):
|
||||
for option in options:
|
||||
value = option.attrib.get('value')
|
||||
text = option.css('::text').get().strip()
|
||||
stu_url = self.host_url + f"/studio.rme/studio={value}"
|
||||
item = IAFDMovieItem()
|
||||
item['title'] = text
|
||||
item['href'] = stu_url
|
||||
item['release_year'] = 0
|
||||
item['from_performer_list'] = 0
|
||||
item['from_dist_list'] = 0
|
||||
item['from_stu_list'] = 1
|
||||
yield item
|
||||
#yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
|
||||
dis_url = f"{self.host_url}/studio.rme/studio={value}"
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
|
||||
|
||||
def parse_stu_dist_page(self, response):
|
||||
list_type = response.meta.get('list_type', '')
|
||||
data, next_url = common_parser(html=response.text, page=list_type)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
|
||||
item = IAFDPersonDetailItem()
|
||||
#yield item
|
||||
|
||||
|
||||
def parse_person_detail_page(self, response):
|
||||
data = common_parser(html=response.text, page='actor', url=response.url)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
|
||||
item = IAFDPersonDetailItem()
|
||||
item['href'] = response.url
|
||||
item['person'] = response.css('h1::text').get()  # assumes the name is in the h1 tag
|
||||
# Parse the remaining details; adjust to the actual page structure
|
||||
item['gender'] = response.css('span.gender::text').get()
|
||||
item['birthday'] = response.css('span.birthday::text').get()
|
||||
item['astrology'] = response.css('span.astrology::text').get()
|
||||
item['birthplace'] = response.css('span.birthplace::text').get()
|
||||
item['years_active'] = response.css('span.years_active::text').get()
|
||||
item['ethnicity'] = response.css('span.ethnicity::text').get()
|
||||
item['nationality'] = response.css('span.nationality::text').get()
|
||||
item['hair_colors'] = response.css('span.hair_colors::text').get()
|
||||
item['eye_color'] = response.css('span.eye_color::text').get()
|
||||
item['height'] = response.css('span.height::text').get()
|
||||
item['weight'] = response.css('span.weight::text').get()
|
||||
item['measurements'] = response.css('span.measurements::text').get()
|
||||
item['tattoos'] = response.css('span.tattoos::text').get()
|
||||
item['piercings'] = response.css('span.piercings::text').get()
|
||||
item['movies_cnt'] = response.css('span.movies_cnt::text').get()
|
||||
item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
|
||||
item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
|
||||
item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
|
||||
item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
|
||||
item['performer_aka'] = response.css('span.performer_aka::text').getall()
|
||||
yield item
|
||||
#yield item
|
||||
|
||||
def parse_movie_detail_page(self, response):
|
||||
title = response.meta.get('title', '')
|
||||
data = common_parser(html=response.text, page='movies', href=response.url, title=title)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
|
||||
item = IAFDMovieDetailItem()
|
||||
item['title'] = response.css('h1::text').get()  # assumes the title is in the h1 tag
|
||||
item['href'] = response.url
|
||||
# Parse the remaining details; adjust to the actual page structure
|
||||
yield item
|
||||
#yield item
|
||||
|
||||
def custom_block_check(self, response):
|
||||
item_type = response.meta.get('item_type', '')
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
|
||||
return "invalid or outdated page"
|
||||
else:
|
||||
self.logger.info(f"right content. url: {response.url}")
|
||||
|
||||
return None
|
||||
|
||||
# Handle page errors, mainly 404 and 403
|
||||
def handle_blocked(self, response, reason):
|
||||
item_type = response.meta.get('item_type', '')
|
||||
if response.status in [404, 403]:
|
||||
self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
|
||||
@@ -111,7 +111,7 @@ class JavhdSpider(BaseSpider):
            item['rank'] = rank
            item['url'] = url
            item[f'{lang}_name'] = name
            #TODO: for non-English pages, go back and update the matching localized name
            # For non-English pages, go back and update the matching localized name
            if lang != 'en':
                item['url'] = replace_lang_param(item['url'])
            yield item
@@ -127,7 +127,7 @@ class JavhdSpider(BaseSpider):
                    meta={"list_item": item}  # pass the list-page data through to the detail page
                )
            else:
                self.logger.info(f"actor(name) has full data. skip. url: {url}")
                self.logger.info(f"actor({name}) has full data. skip. url: {url}")

        # Get the next page
        next_path = data.get("pagination_params", {}).get("next")
399 scrapy_proj/scrapy_proj/spiders/lord_spider.py (new file)
@@ -0,0 +1,399 @@
|
||||
import scrapy
|
||||
import sys
|
||||
import re
|
||||
from urllib.parse import urljoin, quote_plus
|
||||
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param, pretty_json_simple
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||
from scrapy_proj.items import LordActorItem
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_LORD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
|
||||
from scrapy_proj.db_wapper.spider_db_handler import LordDBHandler
|
||||
|
||||
db_tools = LordDBHandler()
|
||||
|
||||
class LordSpider(BaseSpider):
|
||||
name = SPIDER_NAME_LORD
|
||||
allowed_domains = ["www.thelordofporn.com", "thelordofporn.com"]
|
||||
|
||||
# Request headers (reusing the headers from the original curl command)
|
||||
custom_settings = {
|
||||
"DEFAULT_REQUEST_HEADERS": {
|
||||
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
|
||||
"if-modified-since": "Wed, 23 Jul 2025 14:34:28 GMT",
|
||||
"priority": "u=0, i",
|
||||
"sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": "\"macOS\"",
|
||||
"sec-fetch-dest": "document",
|
||||
"sec-fetch-mode": "navigate",
|
||||
"sec-fetch-site": "none",
|
||||
"sec-fetch-user": "?1",
|
||||
"upgrade-insecure-requests": "1",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
|
||||
},
|
||||
"COOKIES_ENABLED": True # 启用Cookie支持
|
||||
}
|
||||
|
||||
def __init__(self, debug='false', mod='update', *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
|
||||
self.update_mod = False if mod and mod.lower() == 'force' else True
|
||||
|
||||
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
|
||||
|
||||
# Entry point, triggered by the base class
|
||||
def custom_start_requests(self):
|
||||
url = 'https://thelordofporn.com/pornstars/'
|
||||
yield scrapy.Request(
|
||||
url=url,
|
||||
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
|
||||
callback=self.parse_list,
|
||||
meta={} # 传递列表页数据到详情页
|
||||
)
|
||||
|
||||
def parse_list(self, response):
|
||||
# Extract every performer entry (article.loop-item, matching the original script)
|
||||
articles = response.css("article.loop-item")
|
||||
self.logger.info(f"当前页({response.url})找到 {len(articles)} 个演员条目")
|
||||
|
||||
for article in articles:
|
||||
try:
|
||||
# 提取演员名称和详情页链接
|
||||
title_tag = article.css("h3.loop-item__title a")
|
||||
title = title_tag.css("::text").get(default="N/A").strip()
|
||||
href = title_tag.attrib.get("href") # 获取a标签的href属性
|
||||
|
||||
# 提取评分
|
||||
rating = article.css("div.loop-item__rating::text").get(default="N/A").strip()
|
||||
|
||||
# 提取排名和投票数(对应原代码中的meta_tags)
|
||||
meta_tags = article.css("div.loop-item__rank span")
|
||||
rank = None
|
||||
votes = None
|
||||
|
||||
# 解析排名(第一个span中的b标签)
|
||||
if len(meta_tags) >= 1:
|
||||
rank_b = meta_tags[0].css("b::text").get()
|
||||
rank = rank_b.strip() if rank_b else "N/A"
|
||||
|
||||
# 解析投票数(第二个span中的b标签)
|
||||
if len(meta_tags) >= 2:
|
||||
votes_b = meta_tags[1].css("b::text").get()
|
||||
votes = votes_b.strip() if votes_b else "N/A"
|
||||
|
||||
# 转换为数值类型(模拟原代码中的utils.parse_numeric)
|
||||
def parse_numeric(value):
|
||||
if not value or value == "N/A":
|
||||
return None
|
||||
# 移除非数字字符(如逗号、%等)
|
||||
numeric_str = ''.join(filter(str.isdigit, value))
|
||||
return int(numeric_str) if numeric_str else None
|
||||
|
||||
# 构建演员数据字典
|
||||
actress_data = {
|
||||
"pornstar": title,
|
||||
"rating": parse_numeric(rating),
|
||||
"rank": parse_numeric(rank),
|
||||
"votes": parse_numeric(votes),
|
||||
"href": href if href else None
|
||||
}
|
||||
# 发起详情查询
|
||||
actor_exists = 0 if not self.update_mod else db_tools.has_full_data(href)
|
||||
if actor_exists < 1 :
|
||||
yield scrapy.Request(
|
||||
url=href,
|
||||
callback=self.parse_actor_detail,
|
||||
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
|
||||
meta = {'actor':actress_data}
|
||||
)
|
||||
else:
|
||||
self.logger.info(f"actor({title}) has full data. skip. url: {href}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"解析演员条目失败: {e}, 页面: {response.url}")
|
||||
continue # 跳过错误条目,继续解析下一个
|
||||
|
||||
# 提取下一页链接(对应原代码中的.next.page-numbers)
|
||||
next_page_url = None
|
||||
next_page_tag = response.css(".nav-links .next.page-numbers")
|
||||
if next_page_tag:
|
||||
next_page_href = next_page_tag.attrib.get("href")
|
||||
if next_page_href and not self.debug:
|
||||
# 拼接完整URL(处理相对路径)
|
||||
next_page_url = urljoin(response.url, next_page_href)
|
||||
yield scrapy.Request(
|
||||
url=next_page_url,
|
||||
callback=self.parse_list,
|
||||
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
|
||||
meta = {}
|
||||
)
|
||||
else:
|
||||
self.logger.info(f"已解析所有页面, current url: {response.url}")
|
||||
|
||||
def parse_actor_detail(self, response):
|
||||
# 1. 定义字段映射表:页面原始字段 -> Item字段
|
||||
FIELD_MAPPING = {
|
||||
# 基本信息
|
||||
'date_modified': 'date_modified',
|
||||
# 排名信息
|
||||
'Global Rank': 'global_rank',
|
||||
'Weekly Rank': 'weekly_rank',
|
||||
# 评分信息
|
||||
'Last Month': 'last_month_rating',
|
||||
'Rating Av.': 'current_rating',
|
||||
'Total of Votes': 'total_votes',
|
||||
# 详细属性
|
||||
'Career start': 'career_start',
|
||||
'Measurements': 'measurements',
|
||||
'Born': 'born',
|
||||
'Height': 'height',
|
||||
'Weight': 'weight',
|
||||
'Name': 'alias_raw', # 别名对应Name字段
|
||||
# 解析后字段(出生/身高/体重)
|
||||
'birth_date': 'birth_date',
|
||||
'birth_year': 'birth_year',
|
||||
'birth_place': 'birth_place',
|
||||
'height_ft': 'height_ft',
|
||||
'height_cm': 'height_cm',
|
||||
'weight_lbs': 'weight_lbs',
|
||||
'weight_kg': 'weight_kg',
|
||||
'alias':'alias'
|
||||
}
|
||||
|
||||
# 2. 初始化原始数据容器
|
||||
raw_data = {}
|
||||
# 3. 提取基础信息
|
||||
raw_data['href'] = response.url
|
||||
entry_header = response.css("header.entry-header")
|
||||
raw_data['name'] = entry_header.css("h1.entry-title::text").get(default="").strip()
|
||||
raw_data['date_modified'] = entry_header.css("time[itemprop='dateModified']::attr(content)").get(default="").strip()
|
||||
|
||||
# 4. 提取排名信息
|
||||
for item in entry_header.css("div.porn-star-rank__item"):
|
||||
item_text = item.css("::text").get(default="").strip()
|
||||
raw_data[item_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
|
||||
|
||||
# 5. 提取评分和投票信息
|
||||
for item in response.css("div.specifications__item--horizontal"):
|
||||
# 1. 精准定位标题区域(排除b标签)
|
||||
# 情况1:有子div的结构(如Rating Av.带img)
|
||||
title_div = item.css("div:first-child")
|
||||
if title_div:
|
||||
# 只提取子div内的文本(自动排除同级的b标签)
|
||||
title_parts = title_div.css("::text").getall()
|
||||
else:
|
||||
# 情况2和3:无子div的结构(Last Month和Total of Votes)
|
||||
# 提取当前item内所有文本,但排除b标签的内容
|
||||
all_text_parts = item.css("::text").getall()
|
||||
b_text_parts = item.css("b::text").getall()
|
||||
# 从所有文本中移除b标签的文本
|
||||
title_parts = [t for t in all_text_parts if t not in b_text_parts]
|
||||
|
||||
# 2. 清理标题文本(处理非断空格和空白)
|
||||
title_text = "".join(title_parts)
|
||||
title_text = title_text.replace(u'\xa0', u' ') # 替换非断空格
|
||||
title_text = re.sub(r'\s+', ' ', title_text).strip() # 合并空白
|
||||
|
||||
raw_data[title_text] = self.parse_numeric(extract_text_from_element(item.css("b")))
|
||||
|
||||
# 6. 提取详细属性(specifications-grid-row)
|
||||
for row in response.css("div.specifications-grid-row"):
|
||||
items = row.css("div.specifications-grid-item")
|
||||
for i in [0, 1]: # 处理每行2个属性
|
||||
if i < len(items):
|
||||
label = extract_text_from_element(items[i].css("h5"))
|
||||
value = extract_text_from_element(items[i].css("span"))
|
||||
if label:
|
||||
raw_data[label] = value
|
||||
|
||||
# 7. 处理特殊字段(别名需要清洗)
|
||||
raw_data['alias'] = self.clean_alias(raw_data.get("Name", ""))
|
||||
|
||||
# 9. 解析出生信息、身高、体重并合并
|
||||
raw_data.update(self.parse_birth_info(raw_data.get("Born", "")))
|
||||
raw_data.update(self.parse_height(raw_data.get("Height", "")))
|
||||
raw_data.update(self.parse_weight(raw_data.get("Weight", "")))
|
||||
|
||||
# 10. 映射到Item并返回
|
||||
item = LordActorItem()
|
||||
item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
|
||||
actor_data = response.meta['actor']
|
||||
for k, v in actor_data.items():
|
||||
if k in item.fields:
|
||||
item[k] = v
|
||||
|
||||
for raw_field, item_field in FIELD_MAPPING.items():
|
||||
if item_field in item.fields:
|
||||
item[item_field] = raw_data.get(raw_field, "")
|
||||
|
||||
# 标记为完整数据
|
||||
item['is_full_data'] = 1
|
||||
self.logger.info(f"actor data: {raw_data}, meta: {response.meta['actor']}, item: {pretty_json_simple(item)}")
|
||||
|
||||
yield item
|
||||
|
||||
# 保留原工具函数(需作为Spider类的方法)
|
||||
def parse_birth_info(self, text):
|
||||
match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text, re.IGNORECASE)
|
||||
if match:
|
||||
return {
|
||||
"birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
|
||||
"birth_year": match.group(3),
|
||||
"birth_place": match.group(4),
|
||||
}
|
||||
return {"birth_date": text, "birth_year": "", "birth_place": ""}
|
||||
|
||||
|
||||
def parse_height2(self, text):
|
||||
match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text, re.IGNORECASE)
|
||||
if match:
|
||||
height_ft = f"{match.group(1)}'{match.group(2)}\""
|
||||
return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
|
||||
return {"height_ft": text, "height_cm": ""}
|
||||
    def parse_height(self, text):
        # Normalize first: commas become decimal points; fix the common typo ' n ' -> ' in '
        text = text.replace(',', '.').replace(' n ', ' in ').strip()

        # Regex that matches the common feet+inches formats (several spellings supported)
        # Group summary:
        # 1. feet value  2. feet unit (feet/ft/ft./')  3. inches value  4. inches unit (inches/in/in./inch/")
        # 5. cm/m value  6. unit (cm/m)
        pattern = r"""
            # Case 1: feet and inches first, then cm/metres (the most common format)
            (?:(\d+)\s*(feet|ft\.?|')\s*)           # feet part (e.g. 5ft / 5')
            (?:and\s*)?                             # optional "and" (e.g. 5 feet and 2 inches)
            (\d+)\s*(inches|in\.?|inch|")?\s*       # inches part (e.g. 2in / 2")
            (?:\(?(\d+\.?\d*)\s*(cm|m)\)?)          # cm/metres part (e.g. (157cm) / (1.57m))

            |   # or

            # Case 2: centimetres first, then feet and inches (e.g. 170 cm / 5 feet and 7 inches)
            (\d+)\s*cm\s*/\s*                       # centimetres first
            (?:(\d+)\s*(feet|ft\.?|')\s*)           # feet part
            (?:and\s*)?
            (\d+)\s*(inches|in\.?|inch|")?          # inches part

            |   # or

            # Case 3: compact shorthand (e.g. 5'3" (160 cm))
            (\d+)'(\d+)"\s*\(?(\d+)\s*cm\)?         # 5'3" format
        """

        # VERBOSE ignores the whitespace inside the pattern, IGNORECASE ignores case
        match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
        if not match:
            # Handle a bare centimetre value (e.g. "160cm")
            cm_match = re.match(r'(\d+)\s*cm', text, re.IGNORECASE)
            if cm_match:
                return {"height_ft": "", "height_cm": cm_match.group(1)}
            return {"height_ft": text, "height_cm": ""}

        # Pull out the matched values (the groups differ per case)
        ft = None
        inch = None
        cm = None
        num = None
        unit = None

        # Case 1: feet and inches first, then cm/metres
        if match.group(1) and match.group(3):
            ft = match.group(1)
            inch = match.group(3)
            num = match.group(5)
            unit = match.group(6).lower() if match.group(6) else 'cm'

        # Case 2: centimetres first, then feet and inches
        elif match.group(7):
            cm = match.group(7)
            ft = match.group(8)
            inch = match.group(10)
            unit = 'cm'  # the leading unit in case 2 is always cm

        # Case 3: compact shorthand (5'3") -- these values live in groups 12-14
        elif match.group(12) and match.group(13):
            ft = match.group(12)
            inch = match.group(13)
            cm = match.group(14)
            unit = 'cm'

        # Convert cm/metres if needed (metres to centimetres)
        if not cm and num and unit:
            if unit == 'm':
                cm = str(int(float(num) * 100))  # 1.57m -> 157cm
            else:
                cm = num  # already a cm value

        # Format the feet/inches expression (e.g. 5'2")
        height_ft = f"{ft}'{inch}\"" if ft and inch else ""

        return {"height_ft": height_ft.strip(), "height_cm": cm.strip() if cm else ""}

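One pitfall worth noting for the pattern above: all three alternation branches share a single group-numbering sequence, which is why the shorthand branch's values land in groups 12-14. A minimal standalone sketch (illustration only, not project code) of that behaviour:

import re

# Every '(' in every alternation branch gets its own group index.
pattern = r"""
    (?:(\d+)\s*ft)\s*(\d+)\s*in\s*\((\d+)\s*cm\)    # branch 1 -> groups 1, 2, 3
    |
    (\d+)'(\d+)"\s*\((\d+)\s*cm\)                   # branch 2 -> groups 4, 5, 6
"""
m = re.match(pattern, """5'3" (160 cm)""", re.VERBOSE)
print(m.group(4), m.group(5), m.group(6))  # -> 5 3 160 (the second branch fills groups 4-6)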
def parse_weight2(self, text):
|
||||
match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text, re.IGNORECASE)
|
||||
if match:
|
||||
return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
|
||||
return {"weight_lbs": text, "weight_kg": ""}
|
||||
|
||||
def parse_weight(self, text):
|
||||
# 预处理:清理空格和常见格式问题
|
||||
text = text.strip().replace(' ', ' ')
|
||||
|
||||
# 正则表达式:匹配多种体重格式
|
||||
# 分组说明:
|
||||
# 1. 磅数值 2. 磅单位(lb/lbs/pounds) 3. 千克数值 4. 千克单位(kg)
|
||||
# 5. 千克在前的数值 6. 千克单位 7. 磅在后的数值 8. 磅单位
|
||||
pattern = r"""
|
||||
# 情况1:磅在前,千克在后(主流格式)
|
||||
(?:(\d+)\s*(lb|lbs|pounds)?\s*) # 磅部分(支持lb/lbs/pounds或省略单位)
|
||||
(?:\(?\s*(\d+)\s*(kg)\s*\)?) # 千克部分(如(45 kg))
|
||||
|
||||
| # 或
|
||||
|
||||
# 情况2:千克在前,磅在后(如52 kg / 114 lbs)
|
||||
(?:(\d+)\s*(kg)\s*/\s*) # 千克部分
|
||||
(\d+)\s*(lb|lbs|pounds)? # 磅部分
|
||||
"""
|
||||
|
||||
# 使用VERBOSE和IGNORECASE标志增强兼容性
|
||||
match = re.match(pattern, text, re.VERBOSE | re.IGNORECASE)
|
||||
if not match:
|
||||
# 尝试匹配纯千克格式(如"52kg")
|
||||
kg_match = re.match(r'(\d+)\s*kg', text, re.IGNORECASE)
|
||||
if kg_match:
|
||||
return {"weight_lbs": "", "weight_kg": kg_match.group(1)}
|
||||
|
||||
# 尝试匹配纯磅格式(如"114lb")
|
||||
lb_match = re.match(r'(\d+)\s*(lb|lbs|pounds)', text, re.IGNORECASE)
|
||||
if lb_match:
|
||||
return {"weight_lbs": lb_match.group(1), "weight_kg": ""}
|
||||
|
||||
# 完全无法解析的情况
|
||||
return {"weight_lbs": text, "weight_kg": ""}
|
||||
|
||||
# 提取匹配结果(根据不同情况处理分组)
|
||||
weight_lbs = None
|
||||
weight_kg = None
|
||||
|
||||
# 情况1:磅在前,千克在后
|
||||
if match.group(1) and match.group(3):
|
||||
weight_lbs = match.group(1)
|
||||
weight_kg = match.group(3)
|
||||
|
||||
# 情况2:千克在前,磅在后
|
||||
elif match.group(5) and match.group(6):
|
||||
weight_kg = match.group(5)
|
||||
weight_lbs = match.group(7)
|
||||
|
||||
return {
|
||||
"weight_lbs": weight_lbs.strip() if weight_lbs else "",
|
||||
"weight_kg": weight_kg.strip() if weight_kg else ""
|
||||
}
|
||||
|
||||
    def clean_alias(self, alias):
        # Note: flags must be passed by keyword; the 4th positional argument of re.sub is `count`
        alias = re.sub(r'\(Age \d+\)', '', alias, flags=re.IGNORECASE)
        return [name.strip() for name in alias.split(',') if name.strip()]
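A quick self-contained illustration of the cleanup above (the input string is invented):

import re

raw = "Jane Doe (Age 25), J. Doe"   # hypothetical alias string, not site data
cleaned = re.sub(r'\(Age \d+\)', '', raw, flags=re.IGNORECASE)
print([name.strip() for name in cleaned.split(',') if name.strip()])  # ['Jane Doe', 'J. Doe']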
|
||||
|
||||
def parse_numeric(self, value):
|
||||
try:
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return 0
|
||||
636 scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py (new file)
@@ -0,0 +1,636 @@
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
#import config
|
||||
#import utils
|
||||
|
||||
# Base URLs and variable parameters
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
astr_base_url = f"{host_url}/astrology.rme/sign="
|
||||
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
|
||||
|
||||
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
|
||||
|
||||
distributors_list_url = f'{host_url}/distrib.asp'
|
||||
distributors_base_url = f"{host_url}/distrib.rme/distrib="
|
||||
|
||||
studios_list_url = f"{host_url}/studio.asp"
|
||||
studios_base_url = f"{host_url}/studio.rme/studio="
|
||||
|
||||
ethnic_list_url = f'{host_url}/advsearch.asp'
|
||||
|
||||
# Set up headers and the scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
http_code_404 = 404
|
||||
http_code_login = 401
|
||||
http_code_url = 601
|
||||
http_code_local = 99
|
||||
|
||||
save_raw_html = True
|
||||
load_from_local = False
|
||||
|
||||
def common_parser(html, page, **kwargs):
|
||||
parser = "lxml" if page=='ethnic' else "html.parser"
|
||||
soup = BeautifulSoup(html, parser)
|
||||
if not soup:
|
||||
return None
|
||||
if page == 'astro':
|
||||
#parse_page_astro(soup, astro):
|
||||
return parse_page_astro(soup, **kwargs)
|
||||
elif page == 'birth':
|
||||
#parse_page_birth(soup, month, day):
|
||||
return parse_page_birth(soup, **kwargs)
|
||||
elif page == 'ethnic':
|
||||
#parse_page_ethnic(soup, ethnic):
|
||||
return parse_page_ethnic(soup, **kwargs)
|
||||
elif page == 'dist':
|
||||
return parse_page_dist_stu(soup,'distable')
|
||||
elif page == 'stu':
|
||||
return parse_page_dist_stu(soup,'studio')
|
||||
elif page == 'actor':
|
||||
#parse_page_performer(soup, url):
|
||||
return parse_page_performer(soup, **kwargs)
|
||||
elif page == 'movies':
|
||||
#parse_page_movie(soup, href, title)
|
||||
return parse_page_movie(soup, **kwargs)
|
||||
else:
|
||||
logging.warning(f"wrong page: {page}")
|
||||
return None
|
||||
|
||||
'''
|
||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||
if load_from_local: # 从本地读取的逻辑
|
||||
html = utils.read_raw_html(url)
|
||||
if html:
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(html) if preprocessor else html
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup, http_code_local # 返回一个小于100的错误码,表明是从本地返回的
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if host_url not in url.lower():
|
||||
logging.error(f'wrong url format: {url}')
|
||||
return None, http_code_url
|
||||
|
||||
response = scraper.get(url, headers=headers)
|
||||
|
||||
# 处理 HTTP 状态码
|
||||
if response.status_code == 404:
|
||||
logging.debug(f"Page not found (404): {url}")
|
||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
# 过期的网页,与404相同处理
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
logging.debug(f"invalid or outdated page: {url}")
|
||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
if save_raw_html:
|
||||
utils.write_raw_html(url, response.text)
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup, response.status_code
|
||||
else:
|
||||
# 检查是否发生跳转,比如到登录页面
|
||||
if response.history:
|
||||
logging.warning(f"Page redirected on {url}. Validation failed.")
|
||||
return None, http_code_login
|
||||
|
||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
|
||||
except cloudscraper.exceptions.CloudflareCode1020 as e:
|
||||
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
|
||||
except Exception as e:
|
||||
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
||||
|
||||
logging.error(f'Fetching failed after max retries. {url}')
|
||||
return None, None # 达到最大重试次数仍然失败
|
||||
'''
|
||||
|
||||
# 修复 HTML 结构,去除多余标签并修正 <a> 标签,在获取人种的时候需要
|
||||
def preprocess_html(html):
|
||||
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
|
||||
|
||||
# 通用的 HTML 结构验证器
|
||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||
if attr_type == "id":
|
||||
return soup.find(tag, id=identifier) is not None
|
||||
elif attr_type == "class":
|
||||
return bool(soup.find_all(tag, class_=identifier))
|
||||
elif attr_type == "name":
|
||||
return bool(soup.find('select', {'name': identifier}))
|
||||
return False
|
||||
|
||||
# 检查电影信息是否存在
|
||||
def movie_validator(soup, table_id):
|
||||
return soup.find("table", id=table_id) is not None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page_ethnic_list(soup, href):
|
||||
div_root = soup.find("select", id="ethnicity1")
|
||||
if not div_root:
|
||||
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
|
||||
return None, None
|
||||
|
||||
list_data = []
|
||||
|
||||
# 提取所有的 <option> 标签
|
||||
options = div_root.find_all('option')
|
||||
if options:
|
||||
# 解析并输出 value 和文本内容
|
||||
for option in options:
|
||||
href = option.get('value', None)
|
||||
text = option.text.strip()
|
||||
if href and href.lower() == 'none':
|
||||
continue
|
||||
list_data.append({
|
||||
"name": text,
|
||||
"href": host_url + href if href else ''
|
||||
})
|
||||
return list_data
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page_astro(soup, astro):
|
||||
astro_div = soup.find("div", id="astro")
|
||||
if not astro_div:
|
||||
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
birth_date = None
|
||||
for elem in astro_div.find_all(recursive=False):
|
||||
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||
birth_date = elem.get_text(strip=True)
|
||||
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||
a_tag = elem.find("a")
|
||||
if a_tag:
|
||||
href = host_url + a_tag["href"]
|
||||
name = a_tag.find("span", class_="perfname")
|
||||
if name:
|
||||
list_data.append({
|
||||
"astrology": astro,
|
||||
"birth_date": birth_date,
|
||||
"person": name.get_text(strip=True),
|
||||
"href": href
|
||||
})
|
||||
flag = True
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# 解析页面内容并更新birth_map
|
||||
def parse_page_birth(soup, month, day):
|
||||
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||
if not datarows:
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.find('a')
|
||||
person = link_tag.text.strip() if link_tag else ''
|
||||
href = link_tag['href'] if link_tag else ''
|
||||
href = host_url + href
|
||||
|
||||
# 如果 href 已经在 birth_map 中,跳过
|
||||
flag = True
|
||||
if any(entry['href'] == href for entry in list_data):
|
||||
continue
|
||||
|
||||
# 将数据添加到 birth_map
|
||||
list_data.append({
|
||||
'month': month,
|
||||
'day': day,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page_ethnic(soup, ethnic):
|
||||
rows = soup.find_all('div', class_='row headshotrow')
|
||||
flag = False
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
for row in rows:
|
||||
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||
link_tag = col.find('a')
|
||||
img_tag = col.find('div', class_='pictag')
|
||||
flag = True
|
||||
|
||||
if link_tag and img_tag:
|
||||
href = host_url + link_tag['href']
|
||||
person = img_tag.text.strip()
|
||||
|
||||
# 将数据存储到 ethnic_map
|
||||
list_data.append({
|
||||
'ethnic': ethnic,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
if flag:
|
||||
logging.debug(f"get {len(list_data)} persons from this page.")
|
||||
|
||||
next_page = soup.find('a', rel='next')
|
||||
if next_page:
|
||||
next_url = host_url + next_page['href']
|
||||
logging.debug(f"Found next page: {next_url}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
logging.debug(f"All pages fetched for {ethnic}.")
|
||||
return list_data, None
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# 解析列表页
|
||||
def parse_page_dist_stu_list(soup, select_name):
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
select_element = soup.find('select', {'name': select_name})
|
||||
if select_element :
|
||||
options = select_element.find_all('option')
|
||||
for option in options:
|
||||
value = option.get('value') # 获取 value 属性
|
||||
text = option.text.strip() # 获取文本内容
|
||||
list_data.append({
|
||||
'name' : text,
|
||||
'href' : str(value)
|
||||
})
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page_dist_stu(soup, table_id):
|
||||
table = soup.find("table", id=table_id)
|
||||
if not table:
|
||||
logging.warning(f"Warning: No {table_id} table found ")
|
||||
return None, None
|
||||
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
list_data = []
|
||||
next_url = None
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 5:
|
||||
title = cols[0].text.strip()
|
||||
label = cols[1].text.strip()
|
||||
year = cols[2].text.strip()
|
||||
rev = cols[3].text.strip()
|
||||
a_href = cols[0].find('a')
|
||||
href = host_url + a_href['href'] if a_href else ''
|
||||
|
||||
list_data.append({
|
||||
'title': title,
|
||||
'label': label,
|
||||
'year': year,
|
||||
'rev': rev,
|
||||
'href': href
|
||||
})
|
||||
return list_data, next_url
|
||||
|
||||
|
||||
# 解析 作品列表,有个人出演,也有导演的
|
||||
def parse_credits_table(table, distributor_list):
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
movies = []
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
|
||||
# rows = table.find_all('tr', class_='we')
|
||||
for row in rows:
|
||||
#tr_class = row.get('class', '') # 获取 class 属性,如果没有则返回空字符串
|
||||
tr_class = ' '.join(row.get('class', [])) # 获取 class 属性,如果没有则返回空字符串
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
href_a = cols[0].find('a')
|
||||
href = href_a['href'] if href_a else ''
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
href_d = cols[2].find('a')
|
||||
href_dist = host_url + href_d['href'] if href_d else ''
|
||||
notes = cols[3].text.strip()
|
||||
rev = cols[4].text.strip()
|
||||
formats = cols[5].text.strip()
|
||||
|
||||
for key in distributor_list:
|
||||
if key in distributor:
|
||||
distributor_count[key] += 1
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'href' : href,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'distributor_href': href_dist,
|
||||
'notes': notes,
|
||||
'rev': rev,
|
||||
'formats': formats,
|
||||
'tr_class': tr_class
|
||||
})
|
||||
return movies, distributor_count
|
||||
|
||||
|
||||
# 请求网页并提取所需数据
|
||||
def parse_page_performer(soup, url):
|
||||
# 提取数据
|
||||
data = {}
|
||||
|
||||
# 定义我们需要的字段名称和HTML中对应的标签
|
||||
fields = {
|
||||
'performer_aka': 'Performer AKA',
|
||||
'birthday': 'Birthday',
|
||||
'astrology': 'Astrology',
|
||||
'birthplace': 'Birthplace',
|
||||
'gender': 'Gender',
|
||||
'years_active': 'Years Active',
|
||||
'ethnicity': 'Ethnicity',
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
}
|
||||
reversed_map = {v: k for k, v in fields.items()}
|
||||
|
||||
# 解析表格数据, 获取参演或者导演的列表
|
||||
role_list = ['personal', 'directoral']
|
||||
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
|
||||
credits_list = {}
|
||||
|
||||
# 使用字典来存储统计
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
for role in role_list:
|
||||
table = soup.find('table', id=role)
|
||||
if table :
|
||||
movies, stat_map = parse_credits_table(table, distributor_list)
|
||||
credits_list[role] = movies
|
||||
# 更新 distributor 统计
|
||||
for distributor in distributor_list:
|
||||
distributor_count[distributor] += stat_map.get(distributor, 0)
|
||||
|
||||
# 统计 movies 数量
|
||||
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
|
||||
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
|
||||
|
||||
# 如果没有找到
|
||||
if len(credits_list) == 0 :
|
||||
logging.warning(f"movie table empty. url: {url} ")
|
||||
|
||||
# 遍历每个 bioheading, 获取metadata
|
||||
bioheadings = soup.find_all('p', class_='bioheading')
|
||||
for bio in bioheadings:
|
||||
heading = bio.text.strip()
|
||||
biodata = None
|
||||
|
||||
# 如果包含 "Performer",需要特殊处理
|
||||
if 'Performer' in heading:
|
||||
heading = 'Performer AKA'
|
||||
biodata_div = bio.find_next('div', class_='biodata')
|
||||
if biodata_div:
|
||||
div_text = biodata_div.get_text(separator='|').strip()
|
||||
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
|
||||
else:
|
||||
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
|
||||
|
||||
# 保存数据
|
||||
if heading in reversed_map:
|
||||
kkey = reversed_map[heading]
|
||||
data[kkey] = biodata
|
||||
|
||||
# 添加统计数据到 data
|
||||
data['movies_cnt'] = movies_cnt
|
||||
data['vixen_cnt'] = distributor_count['vixen']
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
data['credits'] = credits_list
|
||||
|
||||
return data
|
||||
|
||||
|
||||
|
||||
# 解析网页 HTML 并提取电影信息
|
||||
def parse_page_movie(soup, href, title):
|
||||
# 解析电影基础信息
|
||||
movie_data = {}
|
||||
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
|
||||
if info_div:
|
||||
labels = info_div.find_all("p", class_="bioheading")
|
||||
values = info_div.find_all("p", class_="biodata")
|
||||
for label, value in zip(labels, values):
|
||||
key = label.text.strip()
|
||||
if key == "Directors": # 解析多位导演的情况
|
||||
directors = []
|
||||
links = value.find_all("a")
|
||||
for link in links:
|
||||
director_name = link.text.strip()
|
||||
director_href = host_url + link['href'] if link['href'] else ''
|
||||
directors.append({"name": director_name, "href": director_href})
|
||||
movie_data[key] = directors
|
||||
else:
|
||||
val = value.text.strip()
|
||||
if key in ["Distributor", "Studio", "Director"]:
|
||||
link = value.find("a")
|
||||
if link:
|
||||
val = link.text.strip()
|
||||
movie_data[f'{key}Href'] = host_url + link['href']
|
||||
movie_data[key] = val
|
||||
else:
|
||||
return None
|
||||
|
||||
# 解析演职人员信息
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
for cast in cast_divs:
|
||||
performer = {}
|
||||
link = cast.find("a")
|
||||
if link:
|
||||
performer["name"] = link.text.strip()
|
||||
performer["href"] = host_url + link["href"]
|
||||
|
||||
#performer["tags"] = [
|
||||
# tag.strip() for br in cast.find_all("br")
|
||||
# if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
|
||||
#]
|
||||
|
||||
tags = []
|
||||
for br in cast.find_all("br"):
|
||||
tag = br.next_sibling
|
||||
if isinstance(tag, str) and tag.strip():
|
||||
tags.append(tag.strip())
|
||||
performer["tags"] = tags
|
||||
|
||||
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
|
||||
performers.append(performer)
|
||||
|
||||
# 解析场景拆解
|
||||
scene_breakdowns = []
|
||||
scene_table = soup.find("div", id="sceneinfo")
|
||||
if scene_table:
|
||||
rows = scene_table.find_all("tr")
|
||||
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) >= 2:
|
||||
scene = cols[0].text.strip() # 场景编号
|
||||
performer_info = cols[1] # 包含表演者及链接信息
|
||||
|
||||
# 获取 <br> 之前的完整 HTML(保留 <i> 标签等格式)
|
||||
performer_html = str(performer_info) # 获取所有HTML内容
|
||||
split_html = performer_html.split("<br/>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
split_html = performer_html.split("<br>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
performers_html = performer_html.strip() # 如果没有 <br>,取全部
|
||||
|
||||
# 解析为纯文本(去除HTML标签,仅提取文本内容)
|
||||
performers_soup = BeautifulSoup(performers_html, "html.parser")
|
||||
performers_text = performers_soup.get_text()
|
||||
|
||||
# 提取表演者
|
||||
scene_performers = [p.strip() for p in performers_text.split(",")]
|
||||
|
||||
# 尝试获取 `webscene` 和 `studio`
|
||||
links_data = {}
|
||||
links = performer_info.find_all("a")
|
||||
if links:
|
||||
webscene_title = links[0].text.strip() if len(links)>0 else None
|
||||
webscene = links[0]["href"] if len(links)>0 else None
|
||||
studio = links[1].text.strip() if len(links)>1 else None
|
||||
studio_lnk = links[1]["href"] if len(links)>1 else None
|
||||
links_data = {
|
||||
"title": webscene_title,
|
||||
"webscene": webscene,
|
||||
"studio": studio,
|
||||
"studio_lnk": studio_lnk,
|
||||
}
|
||||
|
||||
scene_data = {
|
||||
"scene": scene,
|
||||
"performers": scene_performers,
|
||||
**links_data,
|
||||
}
|
||||
scene_breakdowns.append(scene_data)
|
||||
|
||||
appears_in = []
|
||||
appears_divs = soup.find("div", id="appearssection")
|
||||
if appears_divs:
|
||||
rows = appears_divs.find_all("li")
|
||||
for row in rows:
|
||||
lnk = row.find("a")
|
||||
if lnk:
|
||||
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
|
||||
|
||||
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Directors": movie_data.get("Directors", []), # 可能存在的元素
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
"AppearsIn": appears_in,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
for astro in astro_list:
|
||||
url = astr_base_url + astro
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
while True:
|
||||
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
print(list_data[0] if len(list_data)>0 else 'no data')
|
||||
break
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
|
||||
time.sleep(2) # 控制访问频率
|
||||
@@ -129,3 +129,11 @@ def replace_lang_param(url: str) -> str:
    )
    return urlunparse(new_parsed)

def pretty_json_simple(item):
    try:
        # Serialize to single-line JSON; the argument must be a mapping, not a list
        return json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
    except Exception:
        # Fall back to the original value if the conversion fails
        return item
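A small usage sketch with invented data, showing why the mapping requirement matters:

print(pretty_json_simple({'name': 'Example', 'rank': 1}))  # -> {"name":"Example","rank":1}
print(pretty_json_simple(['not', 'a', 'mapping']))         # dict() fails, the list comes back unchanged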