# resources/scrapy_proj/scrapy_proj/spiders/javdb_spider.py

import scrapy
import re
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import JavdbActorsAliasItem, JavdbActorsItem, JavdbActorsMoviesItem, JavdbMakersItem, JavdbMoviesItem, JavdbMoviesTagsItem, JavdbPublishersItem, JavdbSeriesItem, JavdbTagsItem
from scrapy_proj.db_wapper.spider_db_handler import JavDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_JAVDB
from scrapy_proj.spiders.parser.javdb_parser import common_parser
from scrapy_proj.utils.utils import pretty_json_simple, normalize_url, generate_multilang_urls, is_valid_url
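# Module-level DB handler, shared for existence lookups of already-crawled actors/movies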
db_tools = JavDBHandler()
class JavdbSpiderSpider(BaseSpider):
name = SPIDER_NAME_JAVDB
allowed_domains = ["javdb.com", "www.javdb.com"]
    # Request headers copied from a captured curl command
custom_settings = {
"DEFAULT_REQUEST_HEADERS": {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'priority': 'u=0, i',
'referer': 'https://javdb.com/',
'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0'
},
"COOKIES_ENABLED": True # 启用Cookie支持
}
host_url = "https://www.javdb.com"
def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
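        """Spider arguments (supplied via `scrapy crawl ... -a key=value`):
        debug: 'true' or '1' stops pagination after two list pages per chain.
        cmd:   comma-separated subset of {actors, movies, dist_list}; default is all.
        mod:   'update' enables update mode (sets self.update_mode).
        """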
super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mode = bool(mod) and mod.lower() == 'update'
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
self.cmd_actors = 'actors'
self.cmd_movies = 'movies'
self.cmd_dist = 'dist_list'
self.cmd_list = [self.cmd_actors, self.cmd_movies, self.cmd_dist]
        if cmd:
self.cmd_list = cmd.split(',')
self.existed_actors = {}
self.existed_movies = {}
self.load_existed_actors()
self.load_existed_movies()
self.requested_url = set()
    # Entry point, triggered by the base class
def custom_start_requests(self):
self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
        # Dispatch on the requested command words
if self.cmd_actors in self.cmd_list:
url = urljoin(self.host_url, "/actors/uncensored")
yield scrapy.Request(url,
callback=self.parser_actor_list,
                headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
meta={'uncensored':1, 'from_actor_list':1, 'depth':1})
'''
url = urljoin(self.host_url, "/actors/censored")
yield scrapy.Request(url,
callback=self.parser_actor_list,
                headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
meta={'uncensored':1, 'from_actor_list':1})
'''
    # Parse the actor list page
def parser_actor_list(self, response):
uncensored = response.meta.get('uncensored', 1)
depth = response.meta.get('depth', 1)
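        # depth counts list pages within one chain; debug runs stop after two pages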
        if self.debug and depth >= 3:
            self.logger.info(f'debug mode. stop next page. url: {response.url}')
            return
data, next_url = common_parser(html=response.text, page='actor_list', href=response.url)
if data:
self.logger.info(f"fetched data from {response.url}, data count: {len(data)}")
            for entry in data:
                url = entry['href']
                name = entry['name']
                # Update the name for the matching language
                item = JavdbActorsItem()
                item['href'] = url
                item['name'] = name
                yield item
                # Request the detail page; _create_performer_request dedups URLs itself,
                # so calling _can_request here first would consume the slot and drop the request
                yield from self._create_performer_request(href=url, name=name, actor_url=url, depth=1)
if next_url:
yield scrapy.Request(next_url,
callback=self.parser_actor_list,
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
meta={'uncensored':1, 'from_actor_list':1, 'depth':depth+1}
)
else:
            yield from self._handle_invalid_response(response, page='actor_list')
    # Parse an actor detail page
def parse_actor_detail_page(self, response):
actor_url = response.meta.get('actor_url', '')
actor_name = response.meta.get('actor_name', '')
depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            self.logger.info(f'debug mode. stop next page. url: {response.url}')
            return
data, next_url = common_parser(html=response.text, page='actor', href=response.url)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
            # Decide whether an update is needed: skip when full data exists and the movie count is unchanged
movies_cnt = data.get('movies_cnt', 0)
if not self.need_update_actor(href=actor_url, movies_cnt=movies_cnt):
self.crawler.stats.inc_value(f"{self.name}/actor_done")
self.logger.info(f"actor ({actor_name}) up to date. movies cnt: {movies_cnt} skipping... url: {actor_url}")
return None
            # An update is needed; keep paginating first
if next_url:
yield from self._create_performer_request(href=next_url, name=actor_name, actor_url=actor_url, depth=depth+1)
else:
self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}")
self.crawler.stats.inc_value(f"{self.name}/actor_done")
self.add_actor_to_existed(href=actor_url, movies_cnt=movies_cnt)
            # Update the detail record
item = JavdbActorsItem()
item['href'] = actor_url
item['name'] = actor_name
item['from_actor_list'] = 1
item['movies_cnt'] = movies_cnt
item['avatar'] = data.get('avatar', {})
item['credits'] = data.get('movies', [])
for k, v in data.get('avatar', {}).items():
if k in item.fields:
item[k] = v
yield item
            # Movie links: request any that still need fetching
            for movie in data.get('movies', []):
                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
else:
            yield from self._handle_invalid_response(response, page='actor')
    # Helper that issues actor detail requests
def _create_performer_request(self, href, name, actor_url, depth=1):
if href == '':
return
if is_valid_url(href):
if self._can_request(href):
self.crawler.stats.inc_value(f"{self.name}/actor_all")
yield scrapy.Request(href,
                    callback=self.parse_actor_detail_page,
meta={'actor_name': name, 'actor_url': actor_url, 'item_type':'actor', 'depth':depth }
)
else:
self.logger.warning(f"wrong url. {href}, ignore...")
    # Helper that issues movie detail requests
def _create_movie_request(self, href, title):
if href == '':
return
if is_valid_url(href):
if self.need_update_movie(href) and self._can_request(href):
self.crawler.stats.inc_value(f"{self.name}/movie_all")
yield scrapy.Request(href,
callback=self.parse_movie_detail_page,
meta={'title': title, 'item_type':'movie', 'cache':True}
)
else:
self.logger.warning(f"wrong url. {href}, ignore...")
    # Helper that issues movie list requests (maker / series / pub)
def _create_movie_list_request(self, href, name, category, depth=1):
if href == '':
return
if is_valid_url(href):
yield scrapy.Request(href,
callback=self.parse_movie_list_page,
meta={'name': name, 'category':category, 'depth':depth}
)
else:
self.logger.warning(f"wrong url. {href}, ignore...")
def parse_movie_detail_page(self, response):
title = response.meta.get('title', '')
data = common_parser(html=response.text, page='movies', href=response.url, title=title)
if data:
self.crawler.stats.inc_value(f"{self.name}/movie_done")
self.logger.debug(f"fetched data from {response.url}, data: {data}")
            # Persist the movie record
item = JavdbMoviesItem()
for k, v in data.items():
if k in item.fields:
item[k] = v
yield item
            # Handle the actors list
for actor in data.get('actors', []):
yield from self._create_performer_request(href=actor['href'], name=actor['name'], actor_url=actor['href'], depth=1)
            # Handle maker
            yield from self._create_movie_list_request(href=data.get('maker_link', ''), name=data.get('maker_name', ''), category='maker', depth=1)
            # Handle series
            yield from self._create_movie_list_request(href=data.get('series_link', ''), name=data.get('series_name', ''), category='series', depth=1)
            # Handle publisher
            yield from self._create_movie_list_request(href=data.get('pub_link', ''), name=data.get('pub_name', ''), category='pub', depth=1)
else:
            yield from self._handle_invalid_response(response, page='movie')
    # Shared handler for tags, studio, label, and series list pages
def parse_movie_list_page(self, response):
        category = response.meta.get('category', '')
        name = response.meta.get('name', '')
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            self.logger.info(f"debug mode, stop next page. url: {response.url}")
            return
        data, next_url = common_parser(html=response.text, page='movie_list', href=response.url)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
            # Map the category to its Item class (classes imported above; currently a validity check)
            ITEM_MAPPING = {'maker': JavdbMakersItem, 'series': JavdbSeriesItem, 'pub': JavdbPublishersItem}
            ItemClass = ITEM_MAPPING.get(category)
            if not ItemClass:
                self.logger.warning(f"no Item class mapped for category: {category}")
                return
            # Movie links: request any that still need fetching
            for movie in data:
                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
            # Handle pagination
            if next_url:
                yield scrapy.Request(next_url,
                    callback=self.parse_movie_list_page,
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use the GET headers
                    meta={'name': name, 'category': category, 'depth': depth + 1}
                )
            else:
                self.logger.info(f"movies list ({category}) read all pages. url: {response.url}")
else:
            yield from self._handle_invalid_response(response, page='movie_list')
    # Centralized detection and handling of invalid responses
def _handle_invalid_response(self, response, page=None):
        if response.status == 200:
            if "404 page not found" in response.text.lower():
                self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}")
            else:
                self.logger.warning(f"unknown page. url: {response.url}, content: {response.text[:500]}")
        elif response.status in [404, 403]:
            self.logger.warning(f"got {response.status} page. url: {response.url}")
        else:
            self.logger.warning(f"unknown page. url: {response.url}, status: {response.status}, content: {response.text[:500]}")
if page:
if page == 'actor':
                item = JavdbActorsItem()
item['href'] = response.url
item['zh_name'] = response.meta.get('actor_name', '')
item['is_full_data'] = 404
yield item
elif page == 'movie' :
                item = JavdbMoviesItem()
item['href'] = response.url
item['title'] = response.meta.get('title', '')
item['is_full_data'] = 404
yield item
    # TODO: the table schema needs an extra movies_cnt column
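    # Preload per-actor crawl state (is_full_data, movies_cnt) keyed by href so finished actors are skipped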
def load_existed_actors(self):
query_args = {}
rows = db_tools.query_actors(**query_args)
if rows:
for item in rows:
self.existed_actors[item['href']] = {'is_full_data': item['is_full_data'], 'movies_cnt': item['movies_cnt']}
else:
self.logger.warning(f"query_actors empty. query args: {query_args}")
def load_existed_movies(self):
query_args = {}
rows = db_tools.query_movies(**query_args)
if rows:
for item in rows:
self.existed_movies[item['href']] = item['is_full_data']
else:
self.logger.warning(f"query_movies empty. query args: {query_args}")
    # The in-memory cache could also be replaced with a DB query
    def need_update_movie(self, href):
        return not (href in self.existed_movies and self.existed_movies[href] > 0)
    # The in-memory cache could also be replaced with a DB query
    def need_update_actor(self, href, movies_cnt):
        if href not in self.existed_actors:
            return True
        data = self.existed_actors[href]
        if data['is_full_data'] <= 0:
            return True
        if data['movies_cnt'] < movies_cnt:
            return True
        return False
def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}
def acc_movie_to_existed(self, href, is_full_data=1):
self.existed_movies[href] = is_full_data
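    # Per-run URL dedup: the first caller for a URL wins; later calls are skipped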
def _can_request(self, href):
if href in self.requested_url:
return False
self.requested_url.add(href)
return True
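
# Minimal invocation sketch (assumed command line; the registered spider name is the
# value of SPIDER_NAME_JAVDB in scrapy_proj.comm.comm_def, substitute it below):
#   scrapy crawl <spider-name> -a debug=true -a cmd=actors -a mod=update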