modify scripts

This commit is contained in:
oscarz
2025-04-23 17:13:35 +08:00
parent f42fd2177b
commit f6385b83e4
4 changed files with 296 additions and 13 deletions

View File

@ -148,10 +148,15 @@ def fetch_movies_by_series():
# 更新演员信息 # 更新演员信息
def fetch_performers_detail(): def fetch_performers_detail():
limit_count = 5 if debug else 100
perfomers_list = [] perfomers_list = []
last_perfomer_id = 0
while True: while True:
# 每次从数据库中取一部分,避免一次全量获取 # 每次从数据库中取一部分,避免一次全量获取
perfomers_list = db_tools.query_actors(is_full_data=0, limit=100) if force: # 从头逐个遍历
perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
else: # 只做更新
perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
if len(perfomers_list) < 1: if len(perfomers_list) < 1:
logging.info(f'all performers fetched.') logging.info(f'all performers fetched.')
break break
@ -189,21 +194,27 @@ def fetch_performers_detail():
}) })
if performer_id: if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}') logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_perfomer_id = performer_id
else: else:
logging.warning(f'insert person: ({person}) {url} failed.') logging.warning(f'insert person: ({person}) {url} failed.')
time.sleep(0.5)
# 调试break # 调试break
if debug: if debug:
return True return True
# 更新影片信息 # 更新影片信息
def fetch_movies_detail(): def fetch_movies_detail():
limit_count = 10 if debug else 100
movies_list = [] movies_list = []
while True: last_movie_id = 0
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100) while True:
if force: # 从头逐个遍历
movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
else: # 只做更新
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
if len(movies_list) < 1: if len(movies_list) < 1:
logging.info(f'all movies fetched.') logging.info(f'all movies fetched.')
break break
last_movie_id = 0
succ_count = 0 succ_count = 0
for movie in movies_list: for movie in movies_list:
url = movie['href'] url = movie['href']
@ -231,7 +242,7 @@ def fetch_movies_detail():
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...') logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
else: else:
logging.warning(f'fetch_page error. url: {url}') logging.warning(f'fetch_page error. url: {url}')
time.sleep(1) time.sleep(0.5)
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}') logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
# 调试增加break # 调试增加break
if debug: if debug:
@ -253,6 +264,9 @@ function_map = {
def main(cmd, args_debug, args_force): def main(cmd, args_debug, args_force):
global debug global debug
debug = args_debug debug = args_debug
if debug:
logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
global force global force
force = args_force force = args_force

View File

@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
from requests.exceptions import RequestException from requests.exceptions import RequestException
from functools import partial from functools import partial
import config import config
import utils
# 定义基础 URL 和可变参数 # 定义基础 URL 和可变参数
host_url = "https://www.javdb.com" host_url = "https://www.javdb.com"
@ -24,8 +25,22 @@ headers = {
} }
scraper = cloudscraper.create_scraper() scraper = cloudscraper.create_scraper()
save_raw_html = True
load_from_local = True
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理 #使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None): def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
if load_from_local: # 从本地读取的逻辑
html = utils.read_raw_html(url)
if html:
# 预处理 HTML如果提供了 preprocessor
html_text = preprocessor(html) if preprocessor else html
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
logging.info(f"read from local. href: {url}")
return soup, 99 # 返回一个小于100的错误码表明是从本地返回的
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
if 'javdb.com' not in url.lower(): if 'javdb.com' not in url.lower():
@ -50,6 +65,9 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
logging.warning(f"Page redirected to login page on {url}.") logging.warning(f"Page redirected to login page on {url}.")
return None, 401 return None, 401
if save_raw_html:
utils.write_raw_html(url, response.text)
# 预处理 HTML如果提供了 preprocessor # 预处理 HTML如果提供了 preprocessor
html_text = preprocessor(response.text) if preprocessor else response.text html_text = preprocessor(response.text) if preprocessor else response.text
@ -223,7 +241,7 @@ def parse_actor_detail(soup, href):
# 解析 HTML 内容,提取需要的数据 # 解析 HTML 内容,提取需要的数据
def parse_movie_detail(soup, href, title): def parse_movie_detail_old(soup, href, title):
div_video = soup.find("div", class_='video-meta-panel') div_video = soup.find("div", class_='video-meta-panel')
if not div_video: if not div_video:
logging.warning(f"Warning: No movies div found ") logging.warning(f"Warning: No movies div found ")
@ -272,6 +290,74 @@ def parse_movie_detail(soup, href, title):
'actors': actors 'actors': actors
} }
# Extract a single labelled value from the detail panel.
def parse_movie_one(soup, keys):
    """Return the stripped text of the <span class="value"> that follows a
    <strong> label whose text is one of *keys*, or None when either the
    label or its value span is missing."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return None
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return None
    return value_span.text.strip()
# Extract a labelled value together with its link, when one exists.
def parse_movie_val_href(soup, keys):
    """Return (text, absolute_href) for the value span following a <strong>
    label matching one of *keys*. href is None when the value has no <a>;
    (None, None) when the label or value span is absent."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return None, None
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return None, None
    link = value_span.find('a')
    if link:
        return link.text.strip(), host_url + link.get('href')
    return value_span.text.strip(), None
# Extract every linked item under a labelled value span.
def parse_movie_arr(soup, keys):
    """Return a list of {'name', 'href'} dicts, one per <a> inside the
    <span class="value"> that follows a <strong> label matching one of
    *keys*. hrefs are made absolute with host_url. [] when nothing matches."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return []
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return []
    return [
        {'name': a.text.strip(), 'href': host_url + a.get('href')}
        for a in value_span.find_all('a')
    ]
# Parse a movie detail page into a flat dict of movie attributes.
def parse_movie_detail(soup, href, title):
    """Extract movie metadata from a detail page.

    Args:
        soup: BeautifulSoup of the fetched detail page.
        href: canonical URL of the movie page (stored as-is).
        title: movie title carried over from the listing page.

    Returns:
        dict with keys href, title, cover_url, serial_number, release_date,
        duration, maker_name/maker_link, series_name/series_link, tags,
        actors — or None when the page has no video-meta-panel.
    """
    div_video = soup.find("div", class_='video-meta-panel')
    if not div_video:
        logging.warning(f"Warning: No movies div found ")
        # FIX: was `return None, None` — a truthy 2-tuple that defeats a
        # caller's `if not result` check, while the success path returns a
        # single dict. A bare None keeps the return arity consistent.
        return None

    result = {}
    result['href'] = href
    result['title'] = title
    # Cover image: the <a> wrapping the cover holds the full-size URL.
    cover_img = soup.select_one('.column-video-cover a')
    # .get avoids a KeyError when the anchor carries no href attribute.
    result['cover_url'] = cover_img.get('href') if cover_img else None
    # Labels appear in Chinese or English depending on the site locale,
    # so each lookup passes both variants.
    result['serial_number'] = parse_movie_one(soup, ['番號:', 'ID:'])
    result['release_date'] = parse_movie_one(soup, ['日期:', 'Released Date:'])
    result['duration'] = parse_movie_one(soup, ['時長:', 'Duration:'])
    # Maker and series come with a link to their index pages.
    result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:'])
    result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:'])
    # Tags and actors are multi-valued linked lists.
    result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:'])
    result['actors'] = parse_movie_arr(soup, ['演員:', 'Actor(s):'])
    return result
# 解析 HTML 内容,提取需要的数据 # 解析 HTML 内容,提取需要的数据
def parse_series_uncensored(soup, href): def parse_series_uncensored(soup, href):
div_series = soup.find("div", id='series') div_series = soup.find("div", id='series')

View File

@ -49,7 +49,7 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
performer_id = get_id_by_href('javdb_actors', href) performer_id = get_id_by_href('javdb_actors', href)
if performer_id: if performer_id:
logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}') logging.debug(f"Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}")
return performer_id return performer_id
@ -200,6 +200,33 @@ def query_actors(**filters):
if "is_full_data" in filters: if "is_full_data" in filters:
sql += " AND is_full_data = ?" sql += " AND is_full_data = ?"
params.append(filters["is_full_data"]) params.append(filters["is_full_data"])
if "from_actor_list" in filters:
sql += " AND from_actor_list = ?"
params.append(filters["from_actor_list"])
if "is_full_data_in" in filters:
values = filters["is_full_data_in"]
if values:
placeholders = ", ".join(["?"] * len(values))
sql += f" AND is_full_data IN ({placeholders})"
params.extend(values)
if "is_full_data_not_in" in filters:
values = filters["is_full_data_not_in"]
if values:
placeholders = ", ".join(["?"] * len(values))
sql += f" AND is_full_data NOT IN ({placeholders})"
params.extend(values)
if "before_updated_at" in filters:
sql += " AND updated_at <= ?"
params.append(filters["before_updated_at"])
if "after_updated_at" in filters:
sql += " AND updated_at >= ?"
params.append(filters["after_updated_at"])
if "start_id" in filters:
sql += " AND id > ?"
params.append(filters["start_id"])
if "order_by" in filters:
sql += " order by ? asc"
params.append(filters["order_by"])
if 'limit' in filters: if 'limit' in filters:
sql += " limit ?" sql += " limit ?"
params.append(filters["limit"]) params.append(filters["limit"])
@ -372,13 +399,43 @@ def query_series_hrefs(**filters):
return None return None
# 插入或更新类别 """
def insert_or_update_tags(name, href):
    """Upsert one row into javdb_tags keyed by href.

    Args:
        name: tag display name (overwrites an existing row's name).
        href: unique tag URL used as the conflict key.

    Returns:
        The tag's row id, or None on a database error or missing row.
    """
    try:
        cursor.execute("""
            INSERT INTO javdb_tags (name, href, updated_at)
            VALUES (?, ? , datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                updated_at = datetime('now', 'localtime')
        """, (name, href))
        conn.commit()
        # Read back the row id of the inserted/updated tag.
        cursor.execute("SELECT id FROM javdb_tags WHERE href = ?", (href,))
        row = cursor.fetchone()
        # FIX: the original indexed fetchone()[0] directly, which raises
        # TypeError when no row matches; guard the empty result instead.
        if row and row[0]:
            dist_id = row[0]
            logging.debug(f"insert/update tags succ. id: {dist_id}, name: {name}")
            return dist_id
        return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
# """插入或更新电影数据""" # """插入或更新电影数据"""
def insert_or_update_movie(movie): def insert_or_update_movie(movie):
try: try:
# 获取相关 ID # 获取相关 ID
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
series_id = get_id_by_href('javdb_series', movie['series_link']) series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
# 如果不存在,插入
if makers_id is None and movie['maker_link']:
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')})
if series_id is None and movie['series_link']:
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')})
cursor.execute(""" cursor.execute("""
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration, INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
@ -404,7 +461,7 @@ def insert_or_update_movie(movie):
if movie_id is None: if movie_id is None:
return None return None
logging.debug(f'insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}') logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")
# 插入 performers_movies 关系表 # 插入 performers_movies 关系表
for performer in movie.get('actors', []): for performer in movie.get('actors', []):
@ -412,14 +469,23 @@ def insert_or_update_movie(movie):
# 如果演员不存在,先插入 # 如果演员不存在,先插入
if performer_id is None: if performer_id is None:
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1) performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}")
if performer_id: if performer_id:
tmp_id = insert_actor_movie(performer_id, movie_id) tmp_id = insert_actor_movie(performer_id, movie_id)
if tmp_id: if tmp_id:
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}") logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
else: else:
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}') logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}")
else: else:
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}') logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}")
# 插入 tags 表
for tag in movie.get('tags', []):
tag_name = tag.get('name', '')
tag_href = tag.get('href', '')
tag_id = insert_or_update_tags(tag_name, tag_href)
if tag_id:
logging.debug(f"insert one tags. tag_id: {tag_id}, name:{tag_name}")
return movie_id return movie_id
@ -516,6 +582,33 @@ def query_movie_hrefs(**filters):
if "is_full_data" in filters: if "is_full_data" in filters:
sql += " AND is_full_data = ?" sql += " AND is_full_data = ?"
params.append(filters["is_full_data"]) params.append(filters["is_full_data"])
if "from_actor_list" in filters:
sql += " AND from_actor_list = ?"
params.append(filters["from_actor_list"])
if "is_full_data_in" in filters:
values = filters["is_full_data_in"]
if values:
placeholders = ", ".join(["?"] * len(values))
sql += f" AND is_full_data IN ({placeholders})"
params.extend(values)
if "is_full_data_not_in" in filters:
values = filters["is_full_data_not_in"]
if values:
placeholders = ", ".join(["?"] * len(values))
sql += f" AND is_full_data NOT IN ({placeholders})"
params.extend(values)
if "before_updated_at" in filters:
sql += " AND updated_at <= ?"
params.append(filters["before_updated_at"])
if "after_updated_at" in filters:
sql += " AND updated_at >= ?"
params.append(filters["after_updated_at"])
if "start_id" in filters:
sql += " AND id > ?"
params.append(filters["start_id"])
if "order_by" in filters:
sql += " order by ?"
params.append(filters["order_by"])
if 'limit' in filters: if 'limit' in filters:
sql += " limit ?" sql += " limit ?"
params.append(filters["limit"]) params.append(filters["limit"])

View File

@ -3,8 +3,98 @@ import os
import json import json
import time import time
import csv import csv
from datetime import datetime
from urllib.parse import urlparse from urllib.parse import urlparse
import logging import logging
import config
update_dir = f'{config.global_host_data_dir}/javdb'
# Create (if needed) a one-character bucket directory under base_dir.
def create_sub_directory(base_dir, name):
    """Return base_dir/<first character of *name*, lowercased>, creating it
    when absent. Bucketing by first character keeps directories small.

    Note: the original parameter was called `str`, shadowing the builtin;
    renamed (all in-file callers pass it positionally). The old comment
    claimed "first two letters" but the code takes one character — kept.
    """
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(full_path, exist_ok=True)
    return full_path
# Extract the movie id from a javdb movie-detail URL.
def extract_id_from_href(href):
    """Return the lowercased <id> segment of a javdb.com/v/<id> URL.

    Any URL that is not a movie-detail link yields ''.
    """
    if 'javdb.com/v/' not in href:
        return ''
    # Capture everything after /v/ up to the first query delimiter.
    match = re.search(r'javdb.com/v/([^?&]+)', href)
    return match.group(1).lower() if match else ''
# Save the raw fetched HTML to local disk so pages can be re-checked or
# re-parsed later without another network request.
def write_raw_html(href, html_text):
    # Derive the on-disk id (lowercased /v/ segment) from the URL.
    id = extract_id_from_href(href)
    # Only movie detail pages are cached; everything else is silently skipped.
    if 'javdb.com/v/' in href.lower():
        dir_prefix = 'raw_movies'
    else:
        return
    # Bucket into update_dir/raw_movies/<first char of id>/.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
    file_name = f"{id}.html"  # cache file is named after the movie id
    full_path = os.path.join(file_dir, file_name)
    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
# Load a previously cached raw HTML page for *href*, if a fresh-enough copy
# exists on disk. Counterpart of write_raw_html.
def read_raw_html(href, expire_date_str="2025-03-01"):
    # Derive the on-disk id from the URL.
    id = extract_id_from_href(href)
    # Only movie detail pages are cached; bare return yields None otherwise.
    if 'javdb.com/v/' in href.lower():
        dir_prefix = 'raw_movies'
    else:
        return
    # NOTE(review): create_sub_directory creates the bucket directory even on
    # this read-only path — confirm the side effect is intended.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
    file_name = f"{id}.html"  # cache file is named after the movie id
    full_path = os.path.join(file_dir, file_name)
    try:
        if os.path.exists(full_path):
            # Use the file's mtime as its fetch time.
            last_modified_timestamp = os.path.getmtime(full_path)
            last_modified_date = datetime.fromtimestamp(last_modified_timestamp)
            # Accept the cache only if it was written strictly after the
            # expiry cutoff date (files at/before it count as stale).
            expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
            if last_modified_date > expire_date:
                logging.debug(f"find local file on href {href}")
                with open(full_path, 'r', encoding='utf-8') as file:
                    return file.read()
            else:
                logging.debug(f"expired file {last_modified_date} on href {href}")
                return None
        else:
            return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
# 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数 # 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数