From cc6530d73a3a2bc220a033043952be732035c0b9 Mon Sep 17 00:00:00 2001
From: sophon
Date: Wed, 23 Jul 2025 23:18:21 +0800
Subject: [PATCH] modify scripts

---
 scrapy_proj/cron/cron_scheduler.sh            |  37 ++++
 scrapy_proj/scrapy_proj/comm/comm_def.py      |  14 +-
 .../db_wapper/spider_db_handler.py            |  49 ++++-
 .../scrapy_proj/extensions/stats_extension.py |   2 +-
 scrapy_proj/scrapy_proj/items.py              |  30 +++
 .../scrapy_proj/spiders/iafd_spider.py        |  27 ++-
 .../scrapy_proj/spiders/javhd_spider.py       | 191 ++++++++++++++++++
 scrapy_proj/scrapy_proj/utils/utils.py        |  29 ++-
 8 files changed, 363 insertions(+), 16 deletions(-)
 create mode 100644 scrapy_proj/scrapy_proj/spiders/javhd_spider.py

diff --git a/scrapy_proj/cron/cron_scheduler.sh b/scrapy_proj/cron/cron_scheduler.sh
index 7462108..b027e3b 100755
--- a/scrapy_proj/cron/cron_scheduler.sh
+++ b/scrapy_proj/cron/cron_scheduler.sh
@@ -8,6 +8,7 @@ export PATH="/home/ubuntu/.local/bin:$PATH"
 # Project base paths
 SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+GIT_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)
 LOG_DIR="${SCRAPY_PROJ_DIR}/log"
 mkdir -p "${LOG_DIR}"  # Make sure the log directory exists (the lock file lives here)
 SLEEP_SECONDS=60
@@ -62,6 +63,34 @@ release_lock() {
 
 trap release_lock EXIT
 
+# ==============================================
+# Git operation: pull the latest code
+# ==============================================
+# Purpose: run git pull and check the result
+# Args: 1. repo directory  2. log function (optional)
+# Returns: 0 = success, 1 = failure
+git_pull() {
+    local repo_dir="$1"
+    local log_func="${2:-echo}"  # allow a custom log function
+
+    if [ ! -d "${repo_dir}/.git" ]; then
+        $log_func "ERROR: ${repo_dir} is not a git repository; cannot run git pull"
+        return 1
+    fi
+
+    $log_func "Running git pull to update the code..."
+    # Declare and assign separately: `local var=$(cmd)` would clobber $? with local's own status
+    local pull_output
+    pull_output=$(cd "${repo_dir}" && git pull 2>&1)
+    local exit_code=$?
+
+    if [ ${exit_code} -eq 0 ]; then
+        $log_func "git pull succeeded: ${pull_output}"
+        return 0
+    else
+        $log_func "ERROR: git pull failed (exit code ${exit_code}): ${pull_output}"
+        return 1
+    fi
+}
+
 # ==============================================
 # Argument parsing: distinguish the run period (weekly/monthly)
 # ==============================================
@@ -105,6 +134,7 @@ fi
 # Monthly tasks
 if [ "${PERIOD}" = "--monthly" ]; then
     register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
+    register_spider "javhd" "scrapy crawl javhd -a mod='update' "
 fi
 
@@ -157,6 +187,13 @@ if ! acquire_lock; then
 fi
 log "Acquired the execution lock; starting tasks"
 
+# Pull the latest code (critical step: abort on failure)
+if ! git_pull "${GIT_PROJ_DIR}" log; then
+    log "ERROR: code update failed; aborting the remaining steps"
+    exit 1
+fi
+
+
 # Step 2: check registered tasks
 if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
     log "ERROR: no ${PERIOD#--} spiders registered; exiting"
diff --git a/scrapy_proj/scrapy_proj/comm/comm_def.py b/scrapy_proj/scrapy_proj/comm/comm_def.py
index 6532a71..d1a9e27 100644
--- a/scrapy_proj/scrapy_proj/comm/comm_def.py
+++ b/scrapy_proj/scrapy_proj/comm/comm_def.py
@@ -6,11 +6,15 @@
 #
 
-SPIDER_NAME_SIS = 'sis'
-SPIDER_NAME_U3C3 = 'u3c3'
-SPIDER_NAME_IAFD = 'iafd'
-SPIDER_NAME_PBOX = 'pbox'
-SPIDER_NAME_CLM = 'clm'
+SPIDER_NAME_SIS    = 'sis'
+SPIDER_NAME_U3C3   = 'u3c3'
+SPIDER_NAME_CLM    = 'clm'
+SPIDER_NAME_IAFD   = 'iafd'
+SPIDER_NAME_PBOX   = 'pbox'
+SPIDER_NAME_JAVHD  = 'javhd'
+SPIDER_NAME_JAVDB  = 'javdb'
+SPIDER_NAME_JAVBUS = 'javbus'
+SPIDER_NAME_LORD   = 'lord'
 
 ITEM_TYPE_LIST = 'list'
 ITEM_TYPE_STUDIO = 'studio'
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index 2f8ceae..7d0cddf 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -561,4 +561,51 @@ class PboxDBHandler(SQLiteDBHandler):
 
     def close_spider(self, spider):
         # Close the database connection
-        self.conn.close()
\ No newline at end of file
+        self.conn.close()
+
+
+
+@register_handler(comm.SPIDER_NAME_JAVHD)
+class JavHDDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=shared_db_path):
+        super().__init__(db_path)
+        self.tbl_name_javhd = 'javhd_models'
+
+    def insert_item(self, item):
+        # Index and detail items share the same table and upsert semantics
+        if item['item_type'] in (comm.ITEM_TYPE_ACTOR_INDEX, comm.ITEM_TYPE_ACTOR_DETAIL):
+            self.insert_or_update_common(item, self.tbl_name_javhd, uniq_key='url', exists_do_nothing=False)
+        else:
+            logging.error(f"unknown item type: {item['item_type']}")
+
+        return item
+
+    # Stats helper
+    def get_stat(self):
+        try:
+            self.cursor.execute(f"""
+                SELECT
+                    (SELECT COUNT(*) FROM {self.tbl_name_javhd}) AS cnt
+            """)
+
+            row = self.cursor.fetchone()
+            if not row:
+                logging.warning("query returned no results.")
+                return {}
+
+            columns = [desc[0] for desc in self.cursor.description]
+            return dict(zip(columns, row))
+
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return {}
+
+    def has_full_data(self, href):
+        # Return how many rows for this URL already hold full data.
+        # Returns 0 (not None) on a miss so callers can safely compare with `< 1`.
+        try:
+            self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_javhd} WHERE is_full_data=1 and url = ?", (href,))
+            row = self.cursor.fetchone()
+            return row[0] if row else 0
+        except sqlite3.Error as e:
+            logging.error(f"query error: {e}")
+            return 0
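The handler above leans on `insert_or_update_common` from the `SQLiteDBHandler` base class, which this patch does not show. A minimal sqlite3 sketch of the upsert semantics it is assumed to provide with `uniq_key='url'` and `exists_do_nothing=False` (the table and columns here are illustrative, not the project's real schema):

```python
import sqlite3

# Stand-in for the javhd_models table; the real schema lives elsewhere in the project
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE javhd_models (url TEXT PRIMARY KEY, en_name TEXT, is_full_data INTEGER)")

def insert_or_update(conn, row):
    # Insert, or overwrite the existing row when uniq_key ('url') collides;
    # this is the behaviour assumed of insert_or_update_common with exists_do_nothing=False.
    conn.execute(
        "INSERT INTO javhd_models (url, en_name, is_full_data) "
        "VALUES (:url, :en_name, :is_full_data) "
        "ON CONFLICT(url) DO UPDATE SET "
        "en_name=excluded.en_name, is_full_data=excluded.is_full_data",
        row,
    )

insert_or_update(conn, {"url": "https://javhd.com/en/model/x", "en_name": "X", "is_full_data": 0})
insert_or_update(conn, {"url": "https://javhd.com/en/model/x", "en_name": "X", "is_full_data": 1})
print(conn.execute("SELECT url, is_full_data FROM javhd_models").fetchall())
# -> [('https://javhd.com/en/model/x', 1)]  (one row, updated in place)
```

Keying on `url` is what lets the index pass and the later detail pass enrich the same row instead of duplicating it.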
diff --git a/scrapy_proj/scrapy_proj/extensions/stats_extension.py b/scrapy_proj/scrapy_proj/extensions/stats_extension.py
index 1a3a41a..9b812b1 100644
--- a/scrapy_proj/scrapy_proj/extensions/stats_extension.py
+++ b/scrapy_proj/scrapy_proj/extensions/stats_extension.py
@@ -77,7 +77,7 @@ class StatsExtension:
         # Grab the current stats
         stats = self.stats.get_stats()
         # Grab the spider's custom stats
-        spider_stat = {'sp': '-------'}
+        spider_stat = {'task': '-------'}
         prefix = f"{self.spider_name}/"
         for key, value in stats.items():
             if key.startswith(prefix):
diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py
index a42ebfa..c922eed 100644
--- a/scrapy_proj/scrapy_proj/items.py
+++ b/scrapy_proj/scrapy_proj/items.py
@@ -163,3 +163,33 @@ class ClmKeywordsIndexItem(scrapy.Item):
     index_id = scrapy.Field()
     wid_iid = scrapy.Field()
     tags = scrapy.Field()
+
+class JavHDActorIndexItem(scrapy.Item):
+    item_type = scrapy.Field()
+    rank = scrapy.Field()
+    ja_name = scrapy.Field()
+    zh_name = scrapy.Field()
+    en_name = scrapy.Field()
+    url = scrapy.Field()
+    pic = scrapy.Field()
+    is_full_data = scrapy.Field()
+
+
+class JavHDActorItem(scrapy.Item):
+    item_type = scrapy.Field()
+    rank = scrapy.Field()
+    ja_name = scrapy.Field()
+    zh_name = scrapy.Field()
+    en_name = scrapy.Field()
+    url = scrapy.Field()
+    pic = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    breast_size = scrapy.Field()
+    breast_factor = scrapy.Field()
+    hair_color = scrapy.Field()
+    eye_color = scrapy.Field()
+    birth_date = scrapy.Field()
+    ethnicity = scrapy.Field()
+    birth_place = scrapy.Field()
+    is_full_data = scrapy.Field()
\ No newline at end of file
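Declaring all three `*_name` fields up front matters because the javhd spider further down fills them with a computed key (`item[f'{lang}_name'] = name`), and `scrapy.Item` rejects any key that was not declared as a `Field`. A small sketch of that behaviour (class and values are illustrative):

```python
import scrapy

class DemoItem(scrapy.Item):
    # same pattern as JavHDActorIndexItem: one declared Field per language
    ja_name = scrapy.Field()
    zh_name = scrapy.Field()
    en_name = scrapy.Field()

item = DemoItem()
for lang, name in [("en", "Alice"), ("ja", "アリス")]:
    item[f"{lang}_name"] = name  # works: the computed key hits a declared Field

try:
    item["fr_name"] = "Alice"
except KeyError as e:
    print(e)  # scrapy.Item raises KeyError for keys that were never declared
```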
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index 0041194..9ea1bf8 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -22,9 +22,8 @@ class IAFDSpider(BaseSpider):
     def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.cmd_str = cmd
         self.update = int(update)
-        self.logger.info(f"debug mod: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
+        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
 
         self.cmd_astro = 'astro'
         self.cmd_birth = 'birth'
@@ -33,25 +32,28 @@ class IAFDSpider(BaseSpider):
         self.cmd_stu = 'stu'
         self.cmd_performers = 'performers'
         self.cmd_movies = 'movies'
-        self.cmd_list = self.cmd_str.split(',')
-        if len(self.cmd_list) == 0 :
-            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
+        self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
+        if cmd:
+            self.cmd_list = cmd.split(',')
 
     # Entry point, triggered by the base class
     def custom_start_requests(self):
         # Dispatch on the configured commands
         if self.cmd_astro in self.cmd_list:
-            self.start_astro()
+            # Key fix: iterate the generator returned by start_astro and forward its Requests
+            for req in self.start_astro():
+                yield req  # hand the sub-generator's Requests to the framework
 
         # Fetch performer lists by birthday
         if self.cmd_birth in self.cmd_list:
-            self.start_birth()
+            for req in self.start_birth():
+                yield req  # hand the sub-generator's Requests to the framework
 
         # Fetch the ethnicity list
         if self.cmd_ethnic in self.cmd_list:
             yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
 
-        # Fetch the distributors list 
+        # Fetch the distributors list
         if self.cmd_dist in self.cmd_list:
             yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
 
@@ -68,6 +70,8 @@ class IAFDSpider(BaseSpider):
         # Load the performers pending update
         if self.cmd_performers in self.cmd_list:
             actors = db_tools.get_performers(**query_args)
+            self.crawler.stats.set_value(f"{self.name}/actor_all", len(actors) if actors else 0)
+            self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
             if actors:
                 for item in actors:
                     href = item.get('href', '')
@@ -78,6 +82,8 @@ class IAFDSpider(BaseSpider):
         # Load the movies pending update
         if self.cmd_movies in self.cmd_list:
             movies = db_tools.get_movies(**query_args)
+            self.crawler.stats.set_value(f"{self.name}/movies_all", len(movies) if movies else 0)
+            self.crawler.stats.set_value(f"{self.name}/movies_done", 0)
             if movies:
                 for item in movies:
                     href = item.get('href', '')
@@ -155,6 +161,8 @@ class IAFDSpider(BaseSpider):
         div_root = response.css('select#ethnicity1')
         if div_root:
             options = div_root.css('option')
+            self.crawler.stats.set_value(f"{self.name}/ethnic_all", len(options))
+            self.crawler.stats.set_value(f"{self.name}/ethnic_done", 0)
             for option in options:
                 href = option.attrib.get('value')
                 text = option.css('::text').get().strip()
@@ -190,6 +198,9 @@ class IAFDSpider(BaseSpider):
         if next_page:
             next_url = self.host_url + next_page.attrib['href']
             yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+        else:
+            self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
+            self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")
 
     def parse_distributors_list_page(self, response):
         select_element = response.css('select[name="Distrib"]')
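The `custom_start_requests` change above works because `start_astro()` and `start_birth()` are generator functions: calling one only creates a generator object, so before this fix their Requests were silently discarded. A standalone sketch of the pitfall and the fix (names are illustrative; Scrapy itself is not needed to show the mechanics):

```python
def make_requests():
    # A generator function: its body runs only when the generator is iterated
    for url in ["https://example.com/a", "https://example.com/b"]:
        yield f"Request({url})"

def start_broken():
    make_requests()   # bug: creates a generator object and discards it
    yield from ()     # keeps this function a generator that yields nothing

def start_fixed():
    # Equivalent to the patch's `for req in make_requests(): yield req`
    yield from make_requests()

print(list(start_broken()))  # []
print(list(start_fixed()))   # ['Request(https://example.com/a)', 'Request(https://example.com/b)']
```

`yield from` is the idiomatic spelling of the explicit `for req in ...: yield req` loop used in the patch.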
diff --git a/scrapy_proj/scrapy_proj/spiders/javhd_spider.py b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py
new file mode 100644
index 0000000..5c6d1f7
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/spiders/javhd_spider.py
@@ -0,0 +1,191 @@
+import scrapy
+import sys
+import re
+from urllib.parse import urljoin, quote_plus
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param
+from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
+from scrapy_proj.items import JavHDActorIndexItem, JavHDActorItem
+from scrapy_proj.comm.comm_def import SPIDER_NAME_JAVHD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
+from scrapy_proj.db_wapper.spider_db_handler import JavHDDBHandler
+
+db_tools = JavHDDBHandler()
+
+class JavhdSpider(BaseSpider):
+    name = SPIDER_NAME_JAVHD
+    allowed_domains = ["www.javhd.com", "javhd.com"]
+
+    # Separate header sets for POST and GET requests
+    custom_settings = {
+        # POST headers (list pages only)
+        "POST_HEADERS": {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "content-type": "application/json",
+            "origin": "https://javhd.com",
+            "referer": "https://javhd.com/zh/model",
+            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": "\"macOS\"",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
+            "x-requested-with": "XMLHttpRequest"
+        },
+        # GET headers (detail pages only)
+        "GET_HEADERS": {
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+            "priority": "u=0, i",
+            "referer": "https://javhd.com/zh/model/popular",
+            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": "\"macOS\"",
+            "sec-fetch-dest": "document",
+            "sec-fetch-mode": "navigate",
+            "sec-fetch-site": "same-origin",
+            "sec-fetch-user": "?1",
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
+        },
+        "COOKIES_ENABLED": True
+    }
+
+    def __init__(self, debug='false', mod='update', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        # mod='force' re-crawls every detail page; any other value runs in update mode
+        self.update_mod = not (mod and mod.lower() == 'force')
+
+        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
+
+    # Entry point, triggered by the base class
+    def custom_start_requests(self):
+        lang_list = ['en', 'zh', 'ja']
+        for lang in lang_list:
+            url = f"https://javhd.com/{lang}/model"
+            # List pages are fetched via POST with an empty JSON body (matches curl's --data-raw '{}')
+            yield scrapy.Request(
+                url=url,
+                method="POST",
+                body="{}",  # empty JSON POST body
+                headers=self.settings.get("POST_HEADERS"),  # use the POST header set
+                callback=self.parse_list,
+                meta={'lang': lang, 'current_page': 1}  # track the page number for rank calculation
+            )
+
+    def parse_list(self, response):
+        """Parse the list-page JSON, extract the model list, and follow detail pages"""
+        try:
+            data = response.json()
+        except Exception as e:
+            self.logger.error(f"failed to parse list-page JSON: {e}, response body: {response.text[:500]}")
+            return
+
+        lang = response.meta.get("lang", 'en')
+        current_page = response.meta.get("current_page", 1)
+        self.logger.info(f"parsing page {current_page}, url: {response.url}")
+
+        template = data.get("template", "")
+        # Each model card is rendered as a <thumb-component ...> tag inside the returned HTML template
+        thumb_components = re.findall(r'<thumb-component[^>]*>', template)
+        for idx, thumb in enumerate(thumb_components, start=1):
+            # rank = (page - 1) * cards-per-page + index
+            rank = (current_page - 1) * 36 + idx
+
+            # Pull the fields out of the tag attributes
+            link_content = re.search(r'link-content="(.*?)"', thumb)
+            url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
+            title = re.search(r'title="(.*?)"', thumb)
+
+            # Skip incomplete entries
+            if not url_thumb or not title:
+                self.logger.warning(f"rank {rank} entry is incomplete, skipping | raw data: {thumb}")
+                continue
+
+            pic = url_thumb.group(1)
+            name = title.group(1)
+            url = link_content.group(1) if link_content else ""
+
+            item = JavHDActorIndexItem()
+            item['item_type'] = ITEM_TYPE_ACTOR_INDEX
+            item['rank'] = rank
+            item['url'] = url
+            item['pic'] = pic
+            item[f'{lang}_name'] = name
+            # TODO: for non-English pages, update the corresponding name
+            if lang != 'en':
+                # Normalize to the English URL so every language upserts onto the same row
+                item['url'] = replace_lang_param(item['url'])
+            yield item
+
+            # Only English pages trigger detail requests
+            if url and lang == 'en':
+                actor_exists = 0 if not self.update_mod else db_tools.has_full_data(url)
+                if actor_exists < 1:
+                    yield scrapy.Request(
+                        url=url,
+                        headers=self.settings.get("GET_HEADERS"),  # use the GET header set
+                        callback=self.parse_detail,
+                        meta={"list_item": item}  # pass list-page data through to the detail page
+                    )
+                else:
+                    self.logger.info(f"actor({name}) has full data. skip. url: {url}")
+
+        # Follow the next page
+        next_path = data.get("pagination_params", {}).get("next")
+        if next_path:
+            next_url = urljoin(response.url, next_path)
+            yield scrapy.Request(
+                url=next_url,
+                method="POST",
+                body="{}",  # empty JSON POST body
+                headers=self.settings.get("POST_HEADERS"),  # use the POST header set
+                callback=self.parse_list,
+                meta={'lang': lang, 'current_page': current_page + 1}  # advance the page number
+            )
+        else:
+            self.logger.info(f"list crawl finished, url: {response.url}")
+
+
+    def parse_detail(self, response):
+        list_item = response.meta.get("list_item", {})
+        info_section = response.css("div.info__features")
+        if not info_section:
+            self.logger.warning(f"info__features block not found: {response.url}")
+            return
+
+        FIELD_MAPPING = {
+            "Height": "height",
+            "Weight": "weight",
+            "Breast size": "breast_size",
+            "Breast factor": "breast_factor",
+            "Hair color": "hair_color",
+            "Eye color": "eye_color",
+            "Birth date": "birth_date",
+            "Ethnicity": "ethnicity",
+            "Birth place": "birth_place"
+        }
+
+        item = JavHDActorItem()
+        item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
+        item['url'] = response.url
+        item['is_full_data'] = 1
+        item['rank'] = list_item['rank']
+
+        # Actor name from the h1.title text
+        item['en_name'] = response.css("div.header__info h1.title::text").get(default="").strip()
+        # Avatar URL from the img src attribute
+        item['pic'] = response.css("div.header__info div.avatar img::attr(src)").get(default="").strip()
+
+        # Walk the feature list items with Scrapy's css selectors
+        for li in info_section.css("li.content-desc__list-item"):
+            title = extract_text_from_element(li.css("strong.content-desc__list-title"))
+            value = extract_text_from_element(li.css("span.content-desc__list-text"))
+
+            if title and value:
+                # Map the display label to the database field name
+                db_field = FIELD_MAPPING.get(title)
+                if db_field:
+                    item[db_field] = value
+
+        self.logger.info(f"fetch actor({item['en_name']}) data. url: {response.url}")
+        yield item
\ No newline at end of file
url: {url}") + + # 获取下一页 + next_path = data.get("pagination_params", {}).get("next") + if next_path: + current_url = urljoin(response.url, next_path) + yield scrapy.Request( + url=current_url, + method="POST", + body="{}", # POST请求体(空JSON) + headers=self.settings.get("POST_HEADERS"), # 使用POST头 + callback=self.parse_list, + meta={'lang':lang, 'current_page': current_page+1} # 记录当前页码,用于计算排名 + ) + else: + self.logger.info(f"列表爬取完成, url: {response.url}") + + + def parse_detail(self, response): + list_item = response.meta.get("list_item", {}) + info_section = response.css("div.info__features") + if not info_section: + self.logger.warning(f"未找到 info__features 区块: {href}") + return None + + FIELD_MAPPING = { + "Height": "height", + "Weight": "weight", + "Breast size": "breast_size", + "Breast factor": "breast_factor", + "Hair color": "hair_color", + "Eye color": "eye_color", + "Birth date": "birth_date", + "Ethnicity": "ethnicity", + "Birth place": "birth_place" + } + + item = JavHDActorItem() + item['item_type'] = ITEM_TYPE_ACTOR_DETAIL + item['url'] = response.url + item['is_full_data'] = 1 + item['rank'] = list_item['rank'] + + # 提取 h1.title 中的文本(演员名称) + item['en_name'] = response.css("div.header__info h1.title::text").get(default="").strip() + # 提取 img 的 src 属性(演员头像URL) + item['pic'] = response.css("div.header__info div.avatar img::attr(src)").get(default="").strip() + + # 遍历所有列表项,使用Scrapy的css选择器 + for li in info_section.css("li.content-desc__list-item"): + # 处理文本(复用process_paragraph方法) + title = extract_text_from_element(li.css("strong.content-desc__list-title")) + value = extract_text_from_element(li.css("span.content-desc__list-text")) + + if title and value: + # 通过映射表转换为数据库字段名 + db_field = FIELD_MAPPING.get(title) + if db_field: + item[db_field] = value + + self.logger.info(f"fetch actor({item['en_name']}) data. url: {response.url}") + yield item \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py index a935a0b..042d2e3 100644 --- a/scrapy_proj/scrapy_proj/utils/utils.py +++ b/scrapy_proj/scrapy_proj/utils/utils.py @@ -2,6 +2,7 @@ import re import json import os from datetime import datetime, timezone +from urllib.parse import urlparse, urlunparse, parse_qs, urlencode def load_json_file(file_path): # 检查文件是否存在 @@ -101,4 +102,30 @@ def parse_date_to_datetime(date_str): return datetime.strptime(date_str, format_str) # 如果所有格式都不匹配,抛出错误 - return None \ No newline at end of file + return None + + +def replace_lang_param(url: str) -> str: + """ + 将URL中的lang参数统一替换为'en',支持路径中包含lang的情况 + """ + parsed = urlparse(url) + + # 处理路径中的lang参数(如 /ja/model/... 或 /en/model/...) + path_parts = parsed.path.split('/') + if len(path_parts) >= 2 and path_parts[1] in ['en', 'ja', 'zh']: + path_parts[1] = 'en' # 替换第二个路径段为'en' + new_path = '/'.join(path_parts) + else: + new_path = parsed.path + + # 处理查询参数中的lang(如有) + query = parse_qs(parsed.query) + + # 构建新URL + new_parsed = parsed._replace( + path=new_path, + query=urlencode(query, doseq=True) + ) + return urlunparse(new_parsed) +