338 lines
15 KiB
Python
import scrapy
|
||
import re
|
||
import sys
|
||
from urllib.parse import urljoin, quote_plus
|
||
from scrapy_proj.spiders.base_spider import BaseSpider
|
||
from scrapy_proj.items import JavdbActorsAliasItem, JavdbActorsItem, JavdbActorsMoviesItem, JavdbMakersItem, JavdbMoviesItem, JavdbMoviesTagsItem, JavdbPublishersItem, JavdbSeriesItem, JavdbTagsItem
|
||
from scrapy_proj.db_wapper.spider_db_handler import JavDBHandler
|
||
from scrapy_proj.comm.comm_def import SPIDER_NAME_JAVDB
|
||
from scrapy_proj.spiders.parser.javdb_parser import common_parser
|
||
from scrapy_proj.utils.utils import pretty_json_simple, normalize_url, generate_multilang_urls, is_valid_url
|
||
|
||
db_tools = JavDBHandler()
|
||
|
||
class JavdbSpiderSpider(BaseSpider):
    """Spider for javdb.com.

    Crawls the uncensored actor list, each actor's detail/filmography pages
    and each movie's detail page, yielding Javdb* items for the pipelines.
    In-memory caches (``existed_actors`` / ``existed_movies``), preloaded
    from the DB, are used to skip entities that are already fully stored.
    """

    name = SPIDER_NAME_JAVDB
    allowed_domains = ["javdb.com", "www.javdb.com"]

    # Request headers (reused from a captured curl/browser session).
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'priority': 'u=0, i',
            'referer': 'https://javdb.com/',
            'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0'
        },
        "COOKIES_ENABLED": True  # enable cookie support
    }

    host_url = "https://www.javdb.com"

    def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
        """
        :param debug: 'true'/'1' enables debug mode (pagination capped at depth 3).
        :param cmd: comma-separated command words; empty means run every command.
        :param mod: 'update' switches the spider into update mode.
        """
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mode = bool(mod) and mod.lower() == 'update'
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

        self.cmd_actors = 'actors'
        self.cmd_movies = 'movies'
        self.cmd_dist = 'dist_list'
        self.cmd_list = [self.cmd_actors, self.cmd_movies, self.cmd_dist]
        if cmd and cmd != '':
            self.cmd_list = cmd.split(',')

        # In-memory caches of rows already present in the DB.
        self.existed_actors = {}
        self.existed_movies = {}
        self.load_existed_actors()
        self.load_existed_movies()

        # URLs already scheduled during this run (request-level dedup).
        self.requested_url = set()

    # Entry point, triggered by the base class.
    def custom_start_requests(self):
        self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
        self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
        # Dispatch according to the configured command words.
        if self.cmd_actors in self.cmd_list:
            url = urljoin(self.host_url, "/actors/uncensored")
            yield scrapy.Request(url,
                                 callback=self.parser_actor_list,
                                 headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use GET headers
                                 meta={'uncensored': 1, 'from_actor_list': 1, 'depth': 1})

            '''
            url = urljoin(self.host_url, "/actors/censored")
            yield scrapy.Request(url,
                                 callback=self.parser_actor_list,
                                 headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                                 meta={'uncensored':1, 'from_actor_list':1})
            '''

    # Parse one page of the actor list.
    def parser_actor_list(self, response):
        uncensored = response.meta.get('uncensored', 1)
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            # BUGFIX: was 'selef.logger' (NameError).
            self.logger.info(f'debug mode. stop next page. url: {response.url}')
            return
        data, next_url = common_parser(html=response.text, page='actor_list', href=response.url)
        if data:
            self.logger.info(f"fetched data from {response.url}, data count: {len(data)}")
            for entry in data:
                url = entry['href']
                name = entry['name']
                # Store the name for this language.
                item = JavdbActorsItem()
                item['href'] = url
                item["name"] = name
                yield item

                # Schedule the actor detail request (deduplicated).
                if self._can_request(url):
                    yield from self._create_performer_request(href=url, name=name, actor_url=url, depth=1)

            if next_url:
                yield scrapy.Request(next_url,
                                     callback=self.parser_actor_list,
                                     headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use GET headers
                                     meta={'uncensored': 1, 'from_actor_list': 1, 'depth': depth + 1}
                                     )
        else:
            # BUGFIX: _handle_invalid_response is a generator; without
            # 'yield from' its body (logging + 404 items) never ran.
            yield from self._handle_invalid_response(response, page='actor_list')

    # Parse an actor detail page (avatar, filmography, pagination).
    def parse_actor_detail_page(self, response):
        actor_url = response.meta.get('actor_url', '')
        actor_name = response.meta.get('actor_name', '')
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            # BUGFIX: was 'selef.logger' (NameError).
            self.logger.info(f'debug mode. stop next page. url: {response.url}')
            return
        data, next_url = common_parser(html=response.text, page='actor', href=response.url)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")

            # Skip when the stored record is complete and the movie count matches.
            movies_cnt = data.get('movies_cnt', 0)
            if not self.need_update_actor(href=actor_url, movies_cnt=movies_cnt):
                self.crawler.stats.inc_value(f"{self.name}/actor_done")
                self.logger.info(f"actor ({actor_name}) up to date. movies cnt: {movies_cnt} skipping... url: {actor_url}")
                return None

            # Needs updating: paginate through the filmography first.
            if next_url:
                yield from self._create_performer_request(href=next_url, name=actor_name, actor_url=actor_url, depth=depth + 1)
            else:
                self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}")
                self.crawler.stats.inc_value(f"{self.name}/actor_done")
                self.add_actor_to_existed(href=actor_url, movies_cnt=movies_cnt)

            # Emit the detail item.
            item = JavdbActorsItem()
            item['href'] = actor_url
            item['name'] = actor_name
            item['from_actor_list'] = 1
            item['movies_cnt'] = movies_cnt
            item['avatar'] = data.get('avatar', {})
            item['credits'] = data.get('movies', [])
            for k, v in data.get('avatar', {}).items():
                if k in item.fields:
                    item[k] = v
            yield item

            # Schedule movie detail requests where needed.
            for movie in data.get('movies', []):
                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
        else:
            # BUGFIX: generator call sites need 'yield from' (see above).
            yield from self._handle_invalid_response(response, page='actor')

    # Build an actor-detail request (shared helper).
    def _create_performer_request(self, href, name, actor_url, depth=1):
        if href == '':
            return
        if is_valid_url(href):
            if self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/actor_all")
                # BUGFIX: callback was 'self.parse_person_detail_page',
                # which does not exist; the handler that consumes
                # actor_name/actor_url meta is parse_actor_detail_page.
                yield scrapy.Request(href,
                                     callback=self.parse_actor_detail_page,
                                     meta={'actor_name': name, 'actor_url': actor_url, 'item_type': 'actor', 'depth': depth}
                                     )
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Build a movie-detail request (shared helper).
    def _create_movie_request(self, href, title):
        if href == '':
            return
        if is_valid_url(href):
            if self.need_update_movie(href) and self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/movie_all")
                yield scrapy.Request(href,
                                     callback=self.parse_movie_detail_page,
                                     meta={'title': title, 'item_type': 'movie', 'cache': True}
                                     )
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Build a movie-list request (maker / series / publisher pages).
    def _create_movie_list_request(self, href, name, category, depth=1):
        if href == '':
            return
        if is_valid_url(href):
            yield scrapy.Request(href,
                                 callback=self.parse_movie_list_page,
                                 meta={'name': name, 'category': category, 'depth': depth}
                                 )
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Parse a movie detail page and fan out to actors/maker/series/publisher.
    def parse_movie_detail_page(self, response):
        title = response.meta.get('title', '')
        data = common_parser(html=response.text, page='movies', href=response.url, title=title)
        if data:
            self.crawler.stats.inc_value(f"{self.name}/movie_done")
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            # Persist the movie record.
            item = JavdbMoviesItem()
            for k, v in data.items():
                if k in item.fields:
                    item[k] = v
            yield item

            # Schedule actor detail requests.
            for actor in data.get('actors', []):
                yield from self._create_performer_request(href=actor['href'], name=actor['name'], actor_url=actor['href'], depth=1)

            # ROBUSTNESS: .get() instead of data['...'] — the parser may
            # omit maker/series/publisher for some movies; the helper
            # already ignores empty hrefs.
            yield from self._create_movie_list_request(href=data.get('maker_link', ''), name=data.get('maker_name', ''), category='maker', depth=1)
            yield from self._create_movie_list_request(href=data.get('series_link', ''), name=data.get('series_name', ''), category='series', depth=1)
            yield from self._create_movie_list_request(href=data.get('pub_link', ''), name=data.get('pub_name', ''), category='pub', depth=1)
        else:
            # BUGFIX: generator call sites need 'yield from' (see above).
            yield from self._handle_invalid_response(response, page='movie')

    # Shared handler for maker / series / publisher movie-list pages.
    def parse_movie_list_page(self, response):
        data, next_url = common_parser(html=response.text, page='movie_list', href=response.url)
        category = response.meta.get('category', '')
        name = response.meta.get('name', '')
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            self.logger.info(f"debug mode, stop next page. url: {response.url}")
            return

        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            # BUGFIX: removed the ITEM_MAPPING/prefix lookup — both names
            # were undefined (guaranteed NameError) and the result was
            # never used.

            # Schedule movie detail requests where needed.
            for entry in data:
                yield from self._create_movie_request(href=entry['href'], title=entry['title'])

            # Pagination. BUGFIX: the original issued the next-page request
            # twice (once with data['pub_name'] on a list — TypeError) and
            # reused response.meta, so 'depth' never advanced and the debug
            # cap above could not trigger.
            if next_url:
                yield from self._create_movie_list_request(href=next_url, name=name, category=category, depth=depth + 1)
            else:
                self.logger.info(f"movies list ({category}) read all pages. url :{response.url}")
        else:
            # BUGFIX: generator call sites need 'yield from' (see above).
            yield from self._handle_invalid_response(response, page='movie_list')

    # Centralized handling of invalid/404 responses; yields a marker item
    # (is_full_data=404) so the failure is recorded in the DB.
    def _handle_invalid_response(self, response, page=None):
        if response.status in [200]:
            # BUGFIX: needle must be lowercase to ever match the
            # lowercased body.
            if "404 page not found" in response.text.lower():
                self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}")
            else:
                self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
        elif response.status in [404, 403]:
            self.logger.warning(f"get 404 page. url: {response.url}")
        else:
            self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")

        if page:
            if page == 'actor':
                # BUGFIX: was JavbusActorsItem with a 'zh_name' field —
                # Javbus items are not imported here; this spider uses
                # JavdbActorsItem and its 'name' field.
                item = JavdbActorsItem()
                item['href'] = response.url
                item['name'] = response.meta.get('actor_name', '')
                item['is_full_data'] = 404
                yield item
            elif page == 'movie':
                # BUGFIX: was JavbusMoviesItem (not imported).
                item = JavdbMoviesItem()
                item['href'] = response.url
                item['title'] = response.meta.get('title', '')
                item['is_full_data'] = 404
                yield item

    # TODO: the table schema needs an extra 'movies_cnt' column
    def load_existed_actors(self):
        """Preload actors already stored in the DB into the in-memory cache."""
        query_args = {}
        rows = db_tools.query_actors(**query_args)
        if rows:
            for row in rows:
                self.existed_actors[row['href']] = {'is_full_data': row['is_full_data'], 'movies_cnt': row['movies_cnt']}
        else:
            self.logger.warning(f"query_actors empty. query args: {query_args}")

    def load_existed_movies(self):
        """Preload movies already stored in the DB into the in-memory cache."""
        query_args = {}
        rows = db_tools.query_movies(**query_args)
        if rows:
            for row in rows:
                self.existed_movies[row['href']] = row['is_full_data']
        else:
            self.logger.warning(f"query_movies empty. query args: {query_args}")

    # In-memory cache check; could be switched to a DB query.
    def need_update_movie(self, href):
        return not (href in self.existed_movies and self.existed_movies[href] > 0)

    # In-memory cache check; could be switched to a DB query.
    def need_update_actor(self, href, movies_cnt):
        if href not in self.existed_actors:
            return True
        cached = self.existed_actors[href]
        if cached['is_full_data'] <= 0:
            return True
        if cached['movies_cnt'] < movies_cnt:
            return True

        return False

    def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
        """Mark an actor as fully crawled in the in-memory cache."""
        self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}

    def acc_movie_to_existed(self, href, is_full_data=1):
        """Mark a movie as fully crawled in the in-memory cache."""
        self.existed_movies[href] = is_full_data

    def _can_request(self, href):
        """Return True exactly once per URL per run (request-level dedup)."""
        if href in self.requested_url:
            return False
        self.requested_url.add(href)
        return True