modify scripts

This commit is contained in:
2025-07-27 16:03:59 +08:00
parent 364c39b8ea
commit 45271a5b23
8 changed files with 974 additions and 354 deletions

View File

@ -136,6 +136,7 @@ if [ "${PERIOD}" = "--monthly" ]; then
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
register_spider "javhd" "scrapy crawl javhd -a mod='update' "
register_spider "lord" "scrapy crawl lord -a mod='update' "
register_spider "javbus" "scrapy crawl javbus -a cmd='actors' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ "
fi

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,7 @@ home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"
test_db_path = f"{global_share_data_dir}/sqlite/test.db"
# 单例元类
class SingletonMeta(type):

View File

@ -35,55 +35,6 @@ class Sis001Item(scrapy.Item):
size_gb = scrapy.Field()
update_date = scrapy.Field()
class IAFDPersonItem(scrapy.Item):
item_type = comm.ITEM_TYPE_ACTOR_INDEX
name = scrapy.Field()
href = scrapy.Field()
from_astro_list = scrapy.Field()
from_birth_list = scrapy.Field()
from_ethnic_list = scrapy.Field()
from_movie_list = scrapy.Field()
class IAFDMovieItem(scrapy.Item):
item_type = comm.ITEM_TYPE_MOVIE_INDEX
title = scrapy.Field()
href = scrapy.Field()
release_year = scrapy.Field()
from_performer_list = scrapy.Field()
from_dist_list = scrapy.Field()
from_stu_list = scrapy.Field()
class IAFDPersonDetailItem(scrapy.Item):
item_type = comm.ITEM_TYPE_ACTOR_DETAIL
href = scrapy.Field()
person = scrapy.Field()
gender = scrapy.Field()
birthday = scrapy.Field()
astrology = scrapy.Field()
birthplace = scrapy.Field()
years_active = scrapy.Field()
ethnicity = scrapy.Field()
nationality = scrapy.Field()
hair_colors = scrapy.Field()
eye_color = scrapy.Field()
height = scrapy.Field()
weight = scrapy.Field()
measurements = scrapy.Field()
tattoos = scrapy.Field()
piercings = scrapy.Field()
movies_cnt = scrapy.Field()
vixen_cnt = scrapy.Field()
blacked_cnt = scrapy.Field()
tushy_cnt = scrapy.Field()
x_art_cnt = scrapy.Field()
performer_aka = scrapy.Field()
class IAFDMovieDetailItem(scrapy.Item):
item_type = comm.ITEM_TYPE_MOVIE_DETAIL
title = scrapy.Field()
href = scrapy.Field()
# 可以根据实际需求添加更多影片详情字段
class PBoxStuItem(scrapy.Item):
item_type = scrapy.Field()
label_id = scrapy.Field()
@ -228,10 +179,12 @@ class IafdDistributorsItem(scrapy.Item):
href = scrapy.Field()
parent_id = scrapy.Field()
details = scrapy.Field()
# 以下为添加字段
class IafdMetaEthnicItem(scrapy.Item):
    # One ethnicity option scraped from IAFD's ethnicity dropdown
    # (see parse_ethnic_list_page: name = option text, href = absolute listing URL).
    name = scrapy.Field()
    href = scrapy.Field()
# 以下为添加字段
class IafdMoviesItem(scrapy.Item):
title = scrapy.Field()
@ -251,21 +204,35 @@ class IafdMoviesItem(scrapy.Item):
from_performer_list = scrapy.Field()
from_dist_list = scrapy.Field()
from_stu_list = scrapy.Field()
# 以下为添加字段
Directors = scrapy.Field()
Distributor = scrapy.Field()
DistributorHref = scrapy.Field()
Studio = scrapy.Field()
StudioHref = scrapy.Field()
Director = scrapy.Field()
DirectorHref = scrapy.Field()
Performers = scrapy.Field()
SceneBreakdowns = scrapy.Field()
AppearsIn = scrapy.Field()
class IafdMoviesAppersInItem(scrapy.Item):
    # Relation row: a movie that "appears in" another title (e.g. a compilation).
    # NOTE(review): "Appers" looks like a typo for "Appears"; renaming would break
    # existing callers/pipelines, so the name is kept.
    movie_id = scrapy.Field()        # presumably DB id of the movie — confirm in the pipeline
    appears_in_id = scrapy.Field()   # presumably DB id of the containing title — confirm in the pipeline
    gradation = scrapy.Field()
    notes = scrapy.Field()
# 以下为添加字段
class IafdPerformerAliasesItem(scrapy.Item):
    # One alias (a.k.a. name) of a performer.
    performer_id = scrapy.Field()  # presumably foreign key to the performer row — confirm in the pipeline
    alias = scrapy.Field()
# 以下为添加字段
class IafdPerformerUrlsItem(scrapy.Item):
    # One external URL attached to a performer, with its ordering position.
    performer_id = scrapy.Field()  # presumably foreign key to the performer row — confirm in the pipeline
    position = scrapy.Field()
    url = scrapy.Field()
# 以下为添加字段
class IafdPerformersItem(scrapy.Item):
name = scrapy.Field()
@ -299,18 +266,23 @@ class IafdPerformersItem(scrapy.Item):
from_birth_list = scrapy.Field()
from_ethnic_list = scrapy.Field()
from_movie_list = scrapy.Field()
# 以下为添加字段
credits = scrapy.Field()
performer_aka = scrapy.Field()
class IafdPerformersMoviesItem(scrapy.Item):
    # Credit link between a performer and a movie, with role and optional notes.
    performer_id = scrapy.Field()  # presumably foreign key — confirm in the pipeline
    movie_id = scrapy.Field()      # presumably foreign key — confirm in the pipeline
    role = scrapy.Field()
    notes = scrapy.Field()
# 以下为添加字段
class IafdStudiosItem(scrapy.Item):
    # One studio entry (see parse_studios_list_page: name = option text,
    # href = constructed studio.rme URL).
    name = scrapy.Field()
    href = scrapy.Field()
    parent_id = scrapy.Field()  # presumably parent studio id — confirm in the pipeline
    details = scrapy.Field()
# 以下为添加字段
class IafdTaskLogItem(scrapy.Item):
task_id = scrapy.Field()
@ -321,6 +293,7 @@ class IafdTaskLogItem(scrapy.Item):
total_distributors = scrapy.Field()
total_studios = scrapy.Field()
task_status = scrapy.Field()
# 以下为添加字段
class JavbusActorsItem(scrapy.Item):
ja_name = scrapy.Field()

View File

@ -11,7 +11,7 @@
# return item
import json
import scrapy
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem, PBoxStuItem
from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
class SQLitePipeline():

View File

@ -6,6 +6,16 @@ from twisted.internet import reactor, defer, asyncioreactor
import time
class BaseSpider(scrapy.Spider):
def __init__(self, *args, **kwargs):
    """Base spider: tracks URLs already requested during this run."""
    # Fix: forward to scrapy.Spider.__init__ so `name` and `-a key=value`
    # spider arguments are bound as attributes (missing in the original).
    super().__init__(*args, **kwargs)
    # URLs already requested in this run, used to suppress duplicate requests.
    self.requested_url = set()

def _can_request(self, href):
    """Return True the first time *href* is seen in this run, False afterwards."""
    if href in self.requested_url:
        return False
    self.requested_url.add(href)
    return True
def start_requests(self):
"""统一处理请求生成,兼容不同入口点"""
# 如果定义了async start方法使用它

View File

@ -3,11 +3,11 @@ import re
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.items import IafdDistributorsItem, IafdMetaEthnicItem, IafdMoviesItem, IafdPerformersItem, IafdStudiosItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
from scrapy_proj.spiders.parser.iafd_parser import common_parser
from scrapy_proj.utils.utils import pretty_json_simple
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
db_tools = IAFDDBHandler()
@ -40,8 +40,19 @@ class IAFDSpider(BaseSpider):
if cmd and cmd != '':
self.cmd_list = cmd.split(',')
self.existed_actors = {}
self.existed_movies = {}
self.load_existed_actors()
self.load_existed_movies()
# 入口函数,由基类的方法触发
def custom_start_requests(self):
self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
self.crawler.stats.set_value(f"{self.name}/actor_404", 0)
self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
self.crawler.stats.set_value(f"{self.name}/movie_404", 0)
# 根据命令字执行
if self.cmd_astro in self.cmd_list:
# 关键:迭代 start_astro 产生的生成器,转发其中的 Request
@ -117,59 +128,65 @@ class IAFDSpider(BaseSpider):
async for request in super().start():
yield request
# 获得列表,查询详情
def parse_astro_page(self, response):
    """Parse one astrology listing page and queue a detail request per performer.

    Fixes: dropped the duplicated (pre-edit) debug line and the dead
    `IAFDPersonDetailItem()` reference — that class's import was removed.
    """
    astro = response.meta.get('astro', '')
    data, next_url = common_parser(html=response.text, page='astro', astro=astro)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])
    else:
        self.logger.warning(f"parse data error. {response.url}")
# 获得列表,查询详情
def parse_birth_page(self, response):
    """Parse one birthday (month/day) listing page and queue a detail request per performer.

    Fixes: dropped the duplicated (pre-edit) debug line and the dead
    `IAFDPersonDetailItem()` reference — that class's import was removed.
    """
    month = response.meta['month']
    day = response.meta['day']
    data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])
    else:
        self.logger.warning(f"parse data error. {response.url}")
# 获得列表,查询详情
def parse_ethnic_list_page(self, response):
    """Parse the ethnicity dropdown: emit one IafdMetaEthnicItem per option
    and queue the listing page of each ethnicity.

    Fix: removed the stale duplicate `scrapy.Request(...)` line that preceded
    the item-building code (a leftover pre-edit line), so each option yields
    exactly one request.
    """
    div_root = response.css('select#ethnicity1')
    if div_root:
        options = div_root.css('option')
        self.crawler.stats.set_value(f"{self.name}/ethnic_all", len(options))
        self.crawler.stats.set_value(f"{self.name}/ethnic_done", 0)
        for option in options:
            href = option.attrib.get('value')
            text = option.css('::text').get().strip()
            if href and href.lower() != 'none':
                ethnic_url = urljoin(response.url, href)
                self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                item = IafdMetaEthnicItem()
                item['name'] = text
                item['href'] = ethnic_url
                yield item
                yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
    else:
        self.logger.warning(f"parse page error. url: {response.url}")
# 获得列表,查询详情
def parse_ethnic_page(self, response):
    """Parse one ethnicity listing page: queue performer detail requests and
    follow pagination.

    Fixes: the next-page request's meta used undefined `text` (NameError) —
    it must carry the current `ethnic` value; also removed the stale pre-edit
    logging branch and the dead `IAFDPersonDetailItem()` reference.
    """
    ethnic = response.meta['ethnic']
    data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])
        if next_url:
            # fix: was meta={'ethnic': text} with `text` undefined
            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
        else:
            self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
    else:
        self.logger.warning(f"parse data error. {response.url}")
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
if select_element:
@ -178,7 +195,15 @@ class IAFDSpider(BaseSpider):
value = option.attrib.get('value')
text = option.css('::text').get().strip()
dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
item = IafdDistributorsItem()
item['name'] = text
item['href'] = dis_url
yield item
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
else:
self.logger.warning(f"parse page error. url: {response.url}")
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
@ -188,53 +213,156 @@ class IAFDSpider(BaseSpider):
value = option.attrib.get('value')
text = option.css('::text').get().strip()
dis_url = f"{self.host_url}/studio.rme/studio={value}"
item = IafdStudiosItem()
item['name'] = text
item['href'] = dis_url
yield item
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
else:
self.logger.warning(f"parse page error. url: {response.url}")
def parse_stu_dist_page(self, response):
    """Parse a studio/distributor movie listing and queue a detail request per movie.

    Fix: removed the dead `IAFDPersonDetailItem()` reference — that class's
    import was removed, so the line would raise NameError.
    """
    list_type = response.meta.get('list_type', '')
    data, next_url = common_parser(html=response.text, page=list_type)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        for movie in data:
            yield from self._create_movie_request(href=movie['href'], title=movie['title'])
    else:
        self.logger.warning(f"fetched data error. {response.url}")
# 统一处理发起影片查询的请求
def _create_performer_request(self, href, name):
    """Yield a detail-page request for a performer, at most once per href.

    Fix: the warning referenced undefined `url` (NameError when hit) — use `href`.
    """
    if href != '' and is_valid_url(href):
        if self._can_request(href):
            self.crawler.stats.inc_value(f"{self.name}/actor_all")
            yield scrapy.Request(href,
                                 callback=self.parse_person_detail_page,
                                 meta={'name': name, 'item_type': 'movie'}
                                 )
    else:
        self.logger.warning(f"wrong url. {href}, ignore...")
# 统一处理发起影片查询的请求
def _create_movie_request(self, href, title):
    """Yield a detail-page request for a movie that still needs updating,
    at most once per href.

    Fixes: the warning referenced undefined `url` (use `href`); removed the
    invalid `cache=True` keyword — scrapy.Request has no such parameter
    (it would raise TypeError); HTTP caching is governed by the HTTPCACHE_*
    settings / meta['dont_cache'] instead.
    """
    if href != '' and is_valid_url(href):
        if self.need_update_movie(href) and self._can_request(href):
            self.crawler.stats.inc_value(f"{self.name}/movie_all")
            yield scrapy.Request(href,
                                 callback=self.parse_movie_detail_page,
                                 meta={'title': title, 'item_type': 'movie'}
                                 )
    else:
        self.logger.warning(f"wrong url. {href}, ignore...")
# 演员详情页解析和处理
def parse_person_detail_page(self, response):
    """Parse a performer detail page: emit an IafdPerformersItem and queue a
    request for every movie in the performer's credits.

    Fixes: the credits loop iterated `for item in movies` but read
    `movie['href']` (NameError, and shadowed the yielded item) — renamed the
    loop variable; removed the dead `IAFDPersonDetailItem()` reference.
    """
    data = common_parser(html=response.text, page='actor', url=response.url)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        self.crawler.stats.inc_value(f"{self.name}/actor_done")
        item = IafdPerformersItem()
        for k, v in data.items():
            if k in item.fields:
                item[k] = v
        yield item
        # queue every movie found in the performer's credit lists
        for role, movies in data.get('credits', {}).items():
            if movies:
                for movie in movies:
                    yield from self._create_movie_request(href=movie['href'], title=movie['title'])
    else:
        self.logger.warning(f"fetched data error. {response.url}")
# 影片详情页解析和处理
def parse_movie_detail_page(self, response):
    """Parse a movie detail page: emit an IafdMoviesItem, then follow the
    distributor/studio listing links and director detail links.

    Fix: removed the dead `IAFDMovieDetailItem()` reference — that class's
    import was removed, so the line would raise NameError.
    """
    title = response.meta.get('title', '')
    data = common_parser(html=response.text, page='movies', href=response.url, title=title)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        self.crawler.stats.inc_value(f"{self.name}/movie_done")
        item = IafdMoviesItem()
        for k, v in data.items():
            if k in item.fields:
                item[k] = v
        yield item
        # follow distributor / studio listing pages (deduplicated per run)
        link_url = data.get('DistributorHref', '')
        if is_valid_url(link_url) and self._can_request(link_url):
            yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
        link_url = data.get('StudioHref', '')
        if is_valid_url(link_url) and self._can_request(link_url):
            yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
        # follow the single director plus any extra directors
        link_url = data.get('DirectorHref', '')
        yield from self._create_performer_request(href=link_url, name=data.get('Director'))
        for director in data.get('Directors', []):
            yield from self._create_performer_request(href=director['href'], name=director['name'])
    else:
        self.logger.warning(f"fetched data error. {response.url}")
def custom_block_check(self, response):
    """Return a block-reason string when the body signals an invalid page, else None."""
    item_type = response.meta.get('item_type', '')
    page_text = response.text.lower()
    if "invalid or outdated page" not in page_text:
        self.logger.info(f"right content. url: {response.url}")
        return None
    self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
    return "invalid or outdated page"
# 统一判断并处理异常
def _handle_invalid_response(self, response):
    """Log diagnostics for a 200 response whose body signals an invalid page.

    Non-200 responses are ignored here (they are handled by handle_blocked).
    """
    if response.status != 200:
        return
    if "invalid or outdated page" in response.text.lower():
        self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
        # TODO: mark the 404'd actor or movie as gone in the database
    else:
        self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
# 处理页面异常主要是404, 403
def handle_blocked(self, response, reason):
    """Handle blocked/failed responses — mainly 404 and 403.

    Fix: the original had an `elif` with the exact same `status in [404, 403]`
    condition as the `if`, making that branch unreachable; it was removed.
    """
    item_type = response.meta.get('item_type', '')
    if response.status in [404, 403]:
        self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
        # TODO: mark the 404'd actor or movie as gone in the database
    else:
        self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
def load_existed_actors(self):
    """Preload known performer hrefs from the DB into the in-memory cache."""
    query_args = {}
    rows = db_tools.query_performer_hrefs(**query_args)
    if not rows:
        self.logger.warning(f"query_performer_hrefs empty. query args: {query_args}")
        return
    for row in rows:
        self.existed_actors[row['href']] = {
            'is_full_data': row['is_full_data'],
            'movies_cnt': row['movies_cnt'],
        }
def load_existed_movies(self):
    """Preload known movie hrefs from the DB into the in-memory cache."""
    query_args = {}
    rows = db_tools.query_movie_hrefs(**query_args)
    if not rows:
        self.logger.warning(f"query_movies empty. query args: {query_args}")
        return
    for row in rows:
        self.existed_movies[row['href']] = row['is_full_data']
# 内存缓存也可以改为查询db
# The in-memory cache could alternatively be replaced by a DB query.
def need_update_movie(self, href):
    """Return True when the movie is unknown or not yet stored with full data."""
    return self.existed_movies.get(href, 0) <= 0
# 内存缓存也可以改为查询db
def need_update_actor(self, href, movies_cnt):
    """Return True when the actor is unknown, lacks full data, or has fewer
    cached movies than the freshly reported count."""
    cached = self.existed_actors.get(href)
    if cached is None:
        return True
    return cached['is_full_data'] <= 0 or cached['movies_cnt'] < movies_cnt
def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
    """Record (or refresh) an actor href in the in-memory cache."""
    entry = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}
    self.existed_actors[href] = entry
def acc_movie_to_existed(self, href, is_full_data=1):
    """Record (or refresh) a movie href in the in-memory cache."""
    self.existed_movies[href] = is_full_data

View File

@ -1,4 +1,3 @@
import cloudscraper
import time
import json
@ -11,6 +10,7 @@ import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from datetime import datetime
#import config
#import utils
@ -410,8 +410,8 @@ def parse_page_performer(soup, url):
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'height_str': 'Height',
'weight_str': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
@ -474,6 +474,20 @@ def parse_page_performer(soup, url):
return data
def extract_year_from_date_string(date_str):
    """
    Extract the year from a date string like "Apr 23, 2021".

    :param date_str: date string to parse (e.g. "Apr 23, 2021")
    :return: the year as an int, or 0 when parsing fails
             (fix: the original docstring claimed None, but the code returns 0)
    """
    try:
        return datetime.strptime(date_str, "%b %d, %Y").year
    except (ValueError, TypeError):
        # ValueError: malformed/empty string; TypeError: non-string input such as None
        return 0
# 解析网页 HTML 并提取电影信息
def parse_page_movie(soup, href, title):
@ -595,19 +609,20 @@ def parse_page_movie(soup, href, title):
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"minutes": movie_data.get("Minutes", ""),
"release_date": movie_data.get("Release Date", ""),
"added_to_IAFD_date": movie_data.get("Date Added to IAFD", ""),
"all_girl": movie_data.get("All-Girl", ""),
"all_male": movie_data.get("All-Male", ""),
"compilation": movie_data.get("Compilation", ""),
"webscene": movie_data.get("Webscene", ""),
'release_year': extract_year_from_date_string(movie_data.get("Release Date", "")),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"Studio": movie_data.get("Studio", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Distributor": movie_data.get("Distributor", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"Directors": movie_data.get("Directors", []), # 可能存在的元素
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,