modify scripts

This commit is contained in:
oscarz
2025-03-17 11:30:35 +08:00
parent e6327fbe73
commit d5dc76b87f
178 changed files with 44 additions and 184447 deletions

87
javdb/src/config.py Normal file

@@ -0,0 +1,87 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
# Track log frequency
log_count = defaultdict(int)        # number of times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write per message
class RateLimitFilter(logging.Filter):
"""
    Rate-limiting filter:
    within any 60-second window, the same message is written at most
    LOG_LIMIT times; further occurrences are dropped.
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute
def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the message text keys the counters
        # current time
        now = time.time()
        elapsed = now - last_log_time[message_key]
        # throttle identical messages
        if elapsed < 60:  # within 60 seconds of the previous occurrence
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # more than 60 s elapsed; restart the count
        last_log_time[message_key] = now
        return True  # allow the record through
def setup_logging(log_filename=None):
if log_filename is None:
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
current_date = datetime.now().strftime('%Y%m%d')
log_filename = f'../log/{caller_filename}_{current_date}.log'
    max_log_size = 100 * 1024 * 1024  # 100 MB
    max_log_files = 10  # keep at most 10 rotated log files
file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
file_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
    # create the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding duplicate handlers
logger.addHandler(file_handler)
logger.addHandler(console_handler)
    # attach the rate-limit filter
rate_limit_filter = RateLimitFilter()
file_handler.addFilter(rate_limit_filter)
console_handler.addFilter(rate_limit_filter)
# demo run
if __name__ == "__main__":
setup_logging()
for i in range(1000):
        logging.info("test message to exercise the rate limit")
        time.sleep(0.01)  # simulate rapid logging

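For reference, a minimal usage sketch from a sibling script (an illustration; it assumes config.py is importable from the script's directory, as fetch.py below assumes, and that the ../log directory already exists):

import logging
import config

config.setup_logging()  # the log file name derives from the caller's filename
for _ in range(200):
    logging.info("same message")  # after 60 repeats within a minute, RateLimitFilter drops the rest
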
294
javdb/src/fetch.py Normal file

@@ -0,0 +1,294 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
config.setup_logging()
debug = False
force = False
# Fetch the actor list
def fetch_actor_list():
next_url = scraper.actors_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # write the rows to the database
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f"insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['name']}, href:{row['href']}")
else:
logging.warning(f'fetch actor error. {next_url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# Fetch the makers list
def fetch_makers_list():
next_url = scraper.makers_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # write the rows to the database
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row)
                    if maker_id:
                        logging.debug(f"insert maker to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert maker failed. name: {row['name']}, href:{row['href']}")
else:
                logging.warning(f'fetch maker error. {next_url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# Fetch the series list
def fetch_series_list():
next_url = scraper.series_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # write the rows to the database
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row)
                    if series_id:
                        logging.debug(f"insert series to db. series_id:{series_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert series failed. name: {row['name']}, href:{row['href']}")
else:
                logging.warning(f'fetch series error. {next_url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# Update movie info from the makers list
def fetch_movies_by_maker():
url_list = db_tools.query_maker_hrefs()
if debug:
url_list = db_tools.query_maker_hrefs(name='muramura')
for url in url_list:
        # strip the download flag from the query string (if any)
next_url = utils.remove_url_query(url)
while next_url:
logging.info(f"Fetching data for maker url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
        # return early in debug mode
if debug:
return True
# Update movie info from the series list
def fetch_movies_by_series():
url_list = db_tools.query_series_hrefs()
if debug:
url_list = db_tools.query_series_hrefs(name='10musume')
for url in url_list:
        # strip the download flag from the query string (if any)
next_url = utils.remove_url_query(url)
while next_url:
logging.info(f"Fetching data for series url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
        # return early in debug mode
if debug:
return True
# Update actor details
def fetch_performers_detail():
    performers_list = []
    while True:
        # fetch a batch at a time instead of loading everything at once
        performers_list = db_tools.query_actors(is_full_data=0, limit=100)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        for performer in performers_list:
url = performer['href']
person = performer['name']
pic = ''
alias = []
next_url = url
all_movies = []
while next_url:
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url)
if data:
pic = data.get('pic', '')
alias = data.get('alias', [])
all_movies.extend(data.get('movies', []))
elif status_code and status_code == 404:
                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
break
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
            # all of this person's movies fetched; insert the data
performer_id = db_tools.insert_or_update_actor({
'href': url,
'name': person,
'pic' : pic,
'alias' : alias,
'credits':all_movies
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
else:
logging.warning(f'insert person: ({person}) {url} failed.')
        # return early in debug mode
if debug:
return True
# Update movie details
def fetch_movies_detail():
movies_list = []
while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
if len(movies_list) < 1:
            logging.info('all movies fetched.')
break
for movie in movies_list:
url = movie['href']
title = movie['title']
logging.info(f"Fetching data for movie ({title}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
if soup:
movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
else:
logging.warning(f'insert movie {url} failed.')
else:
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}')
break
else:
logging.warning(f'fetch_page error. url: {url}')
time.sleep(1)
        # return early in debug mode
if debug:
return True
# Map shortcut names to functions
function_map = {
"actor_list": fetch_actor_list,
"maker_list": fetch_makers_list,
"series_list": fetch_series_list,
"makers": fetch_movies_by_maker,
"series" : fetch_movies_by_series,
"movies" : fetch_movies_detail,
"actors" : fetch_performers_detail,
}
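# e.g. --cmd "actor_list,movies" runs fetch_actor_list() then fetch_movies_detail()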
# Main entry point
def main(cmd, args_debug, args_force):
global debug
debug = args_debug
global force
force = args_force
    # open a task-log entry
task_id = db_tools.insert_task_log()
if task_id is None:
        logging.warning('insert task log error.')
return None
logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
    # run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
    else:  # no --cmd given: run everything
for name, func in function_map.items():
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {name}')
func()
            else:
                logging.warning(f" {name} is not a valid function shortcut.")
    logging.info('all processes completed!')
db_tools.finalize_task_log(task_id)
# TODO:
# 1,
if __name__ == "__main__":
    # command-line argument handling
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch javdb data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
args = parser.parse_args()
main(args.cmd, args.debug, args.force)

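Typical invocations (illustrative; the shortcut names are the keys of function_map above):

python fetch.py --cmd actor_list,maker_list --debug
python fetch.py --cmd movies --force
python fetch.py                      # no --cmd: run every task in function_map order
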
504
javdb/src/scraper.py Normal file

@@ -0,0 +1,504 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs
host_url = "https://www.javdb.com"
actors_uncensored_base_url = f'{host_url}/actors/uncensored'
series_uncensored_base_url = f'{host_url}/series/uncensored'
makers_uncensored_base_url = f'{host_url}/makers/uncensored'
# Request headers and the CloudScraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a page with CloudScraper and validate it; supports custom parsers and optional HTML preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
if 'javdb.com' not in url.lower():
logging.error(f'wrong url format: {url}')
return None, None
response = scraper.get(url, headers=headers)
            # handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # return 404 directly so the caller can skip this page
            response.raise_for_status()  # raise on other HTTP errors
            # pre-process the HTML if a preprocessor was provided
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after max retries
# Repair the HTML: drop stray <br> tags and patch <a> tags; needed on some detail pages
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Parse the page number out of a link
def url_page_num(href):
if href is None:
return None
match = re.search(r'page=(\d+)', href)
if match:
next_page_number = int(match.group(1))
return next_page_number
else:
return None
# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
def parse_avatar_image(soup):
try:
span = soup.find("span", class_="avatar")
if not span:
return "" # 没有找到 <span> 元素,返回空字符串
style = span.get("style", "")
match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
return match.group(1) if match else "" # 解析成功返回 URL否则返回空字符串
except Exception as e:
return "" # 发生异常时,返回空字符串
# Parse the uncensored-actors list page
def parse_actors_uncensored(soup, href):
div_actors = soup.find("div", id='actors')
if not div_actors:
        logging.warning("no actors div found")
return None, None
    # parse the entries
rows = div_actors.find_all('div', class_='box actor-box')
list_data = []
next_url = None
for row in rows:
        # actor detail link
actor_link = row.find('a')['href']
        # actor name
actor_name = row.find('strong').text.strip()
        # avatar image URL
avatar_url = row.find('img', class_='avatar')['src']
        # aliases from the title attribute
alias_list = row.find('a')['title'].split(", ")
list_data.append({
'name' : actor_name,
'href' : host_url + actor_link if actor_link else '',
'pic' : avatar_url,
'alias': alias_list
})
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse an actor detail page
def parse_actor_detail(soup, href):
    # look for aliases first
alias_list = []
div_meta = soup.find('span', class_='actor-section-name')
if not div_meta:
logging.warning(f'warning: no meta data found in page {href}')
return None, None
alias_div = soup.find('div', class_='column section-title')
if alias_div:
meta_list = alias_div.find_all('span', class_='section-meta')
if len(meta_list) > 1:
alias_list = meta_list[0].text.strip().split(", ")
    # avatar
pic = ''
avatar = soup.find("div", class_="column actor-avatar")
if avatar:
pic = parse_avatar_image(avatar)
    # data to return
actor = {}
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
        logging.warning("no movies div found")
return None, None
    # parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
actor = {
'pic' : pic,
'alias' : alias_list,
'movies' : list_data
}
return actor, next_url
# Parse a movie detail page
def parse_movie_detail(soup, href, title):
div_video = soup.find("div", class_='video-meta-panel')
if not div_video:
logging.warning(f"Warning: No movies div found ")
return None, None
    # cover image
cover_img = soup.select_one('.column-video-cover a')
cover_url = cover_img['href'] if cover_img else None
    # serial number
serial = soup.select_one('.panel-block:first-child .value')
serial_number = serial.text.strip() if serial else None
    # release date
date = soup.select_one('.panel-block:nth-of-type(2) .value')
release_date = date.text.strip() if date else None
    # duration
duration = soup.select_one('.panel-block:nth-of-type(3) .value')
video_duration = duration.text.strip() if duration else None
    # maker
maker = soup.select_one('.panel-block:nth-of-type(4) .value a')
maker_name = maker.text.strip() if maker else None
maker_link = maker['href'] if maker else None
    # series
series = soup.select_one('.panel-block:nth-of-type(5) .value a')
series_name = series.text.strip() if series else None
series_link = series['href'] if series else None
    # actors (name + link)
actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']} for actor in soup.select('.panel-block:nth-of-type(8) .value a')]
return {
'href' : href,
'title' : title,
'cover_url': cover_url,
'serial_number': serial_number,
'release_date': release_date,
'duration': video_duration,
'maker_name': maker_name,
'maker_link': host_url + maker_link if maker_link else '',
'series_name': series_name,
'series_link': host_url + series_link if series_link else '',
'actors': actors
}
# Parse the uncensored-series list page
def parse_series_uncensored(soup, href):
div_series = soup.find("div", id='series')
if not div_series:
        logging.warning("no series div found")
return None, None
    # parse the entries
rows = div_series.find_all('a', class_='box')
list_data = []
next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # do not shadow the href parameter; pagination below still needs it
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))
        list_data.append({
            'name': name,
            'href': host_url + link if link else '',
            'movies': movies
        })
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse a series detail page
def parse_series_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
        logging.warning("no movies div found")
return [], None
    # parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the uncensored-makers list page
def parse_makers_uncensored(soup, href):
div_series = soup.find("div", id='makers')
if not div_series:
        logging.warning("no makers div found")
return None, None
    # parse the entries
rows = div_series.find_all('a', class_='box')
list_data = []
next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # do not shadow the href parameter; pagination below still needs it
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))
        list_data.append({
            'name': name,
            'href': host_url + link if link else '',
            'movies': movies
        })
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse a maker detail page
def parse_maker_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
        logging.warning("no movies div found")
return [], None
    # parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
    # look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
###### test code below ######
def test_actors_list():
next_url = actors_uncensored_base_url
while next_url:
print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
list_data, next_url = parse_actors_uncensored(soup, next_url)
if list_data :
print(list_data)
else:
print('get wrong page.')
if next_url:
print(next_url)
break
def test_actor():
next_url = 'https://javdb.com/actors/mdRn'
all_data = []
while next_url:
print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = parse_actor_detail(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
print(all_data)
def test_movie_detail():
movie_url = 'https://javdb.com/v/gB2Q7'
while True:
        soup, status_code = fetch_page(movie_url, partial(generic_validator, tag="div", identifier="video-detail", attr_type="class"))
if soup:
detail = parse_movie_detail(soup, movie_url, 'RED193 無碼 レッドホットフェティッシュコレクション 中出し120連発 4 : 波多野結衣, 愛乃なみ, 夢実あくび, 他多数')
if detail:
print(detail)
break
def test_series_list():
next_url = 'https://javdb.com/series/uncensored'
all_data = []
while next_url:
print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = parse_series_uncensored(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
break
print(all_data)
def test_series_detail():
next_url = 'https://javdb.com/series/39za'
all_data = []
while next_url:
print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = parse_series_detail(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
print(all_data)
if __name__ == "__main__":
#test_actors_list()
#test_actor()
test_movie_detail()
#test_series_list()
#test_series_detail()

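Note that preprocess_html is defined above but not called by any of the parsers; presumably it is meant to be passed as fetch_page's preprocessor argument. A sketch of that wiring (detail_url is a hypothetical variable):

soup, status_code = fetch_page(
    detail_url,
    partial(generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"),
    preprocessor=preprocess_html,  # strips <br> and patches <a> tags before parsing
)
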
599
javdb/src/sqlite_utils.py Normal file

@@ -0,0 +1,599 @@
import sqlite3
import json
import config
import logging
from datetime import datetime
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # replace with your database file
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
cursor = conn.cursor()
# Look up an id by href in the given table
def get_id_by_href(table: str, href: str) -> int:
if href is None:
return None
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
row = cursor.fetchone()
return row[0] if row else None
def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
try:
        # check whether the actor already exists
cursor.execute("SELECT id, name, from_actor_list, from_movie_list FROM javdb_actors WHERE href = ?", (href,))
existing_actor = cursor.fetchone()
        if existing_actor:  # the actor already exists
actor_id, existing_name, existing_actor_list, existing_movie_list = existing_actor
            # keep the existing values when no new value is passed in
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor_list
from_movie_list = from_movie_list if from_movie_list is not None else existing_movie_list
cursor.execute("""
UPDATE javdb_actors
SET name = ?,
from_actor_list = ?,
from_movie_list = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (name, from_actor_list, from_movie_list, href))
        else:  # the actor does not exist; insert it
cursor.execute("""
INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
""", (href, name, from_actor_list, from_movie_list))
conn.commit()
performer_id = get_id_by_href('javdb_actors', href)
if performer_id:
logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
return performer_id
except sqlite3.Error as e:
conn.rollback()
        logging.error(f"database error: {e}")
return None
except Exception as e:
conn.rollback()
        logging.error(f"unexpected error: {e}")
return None
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None):
try:
        # check whether the movie already exists
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series FROM javdb_movies WHERE href = ?", (href,))
existing_movie = cursor.fetchone()
        if existing_movie:  # the movie already exists
movie_id, existing_actor, existing_maker, existing_series = existing_movie
            # keep the existing values when no new value is passed in
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
from_movie_makers = from_movie_makers if from_movie_makers is not None else existing_maker
from_movie_series = from_movie_series if from_movie_series is not None else existing_series
cursor.execute("""
UPDATE javdb_movies
SET title = ?,
from_actor_list = ?,
from_movie_makers = ?,
from_movie_series = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (title, from_actor_list, from_movie_makers, from_movie_series, href))
        else:  # the movie does not exist; insert it
cursor.execute("""
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, from_actor_list, from_movie_makers, from_movie_series))
conn.commit()
movie_id = get_id_by_href('javdb_movies', href)
if movie_id:
logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
return movie_id
except Exception as e:
conn.rollback()
logging.error(f"Error inserting/updating movie: {e}")
return None
# Insert an actor-movie relation
def insert_actor_movie(performer_id, movie_id, tags=''):
try:
cursor.execute("""
INSERT INTO javdb_actors_movies (actor_id, movie_id, tags)
VALUES (?, ?, ?)
ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags
""",
(performer_id, movie_id, tags)
)
conn.commit()
#logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
return performer_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Insert or update an actor with full details
def insert_or_update_actor(actor):
try:
cursor.execute('''
INSERT INTO javdb_actors (name, href, pic, is_full_data, updated_at)
VALUES (?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime')
''', (actor['name'], actor['href'], actor['pic']))
        conn.commit()
        actor_id = get_id_by_href('javdb_actors', actor['href'])
        if actor_id is None:
            logging.warning(f"insert data error. name: {actor['name']}, href: {actor['href']}")
            return None
        logging.debug(f"insert one actor, id: {actor_id}, name: {actor['name']}, href: {actor['href']}")
        # insert aliases
for alias in actor.get("alias") or []:
cursor.execute('''
INSERT OR IGNORE INTO javdb_actors_alias (actor_id, alias, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
''', (actor_id, alias))
conn.commit()
        # insert the movie list
for movie in actor.get("credits") or []:
movie_id = get_id_by_href('javdb_movies', movie['href'])
            # movie not present yet; insert its index first
if movie_id is None:
movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
if movie_id:
tmp_id = insert_actor_movie(actor_id, movie_id)
                if tmp_id:
                    logging.debug(f'insert one performer_movie, performer_id: {actor_id}, movie_id: {movie_id}')
                else:
                    logging.warning(f"insert performer_movie failed. performer_id: {actor_id}, movie href: {movie['href']}")
return actor_id
except Exception as e:
        logging.error(f"insert/update actor {actor['name']} failed: {e}")
conn.rollback()
# Delete an actor
def delete_actor_by_href(href):
try:
cursor.execute('DELETE FROM javdb_actors WHERE href = ?', (href,))
conn.commit()
        logging.info(f"deleted actor: {href}")
except Exception as e:
        logging.error(f"delete actor {href} failed: {e}")
conn.rollback()
# Query actors by filter
def query_actors(**filters):
try:
sql = "SELECT href, name FROM javdb_actors WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
        # return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Insert or update a maker
def insert_or_update_makers(data):
try:
cursor.execute("""
INSERT INTO javdb_makers (name, href, updated_at)
VALUES (?, ? , datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
        # fetch the maker id
cursor.execute("SELECT id FROM javdb_makers WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
            logging.debug(f"inserted/updated maker: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
        logging.error(f"database error: {e}")
return None
# Delete a maker (by id or name)
def delete_maker(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_makers WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_makers WHERE name = ?", (identifier,))
conn.commit()
        logging.info(f"deleted maker: {identifier}")
except sqlite3.Error as e:
conn.rollback()
        logging.error(f"delete failed: {e}")
# Query a maker (by id or name)
def query_maker(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_makers WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_makers WHERE name LIKE ?", (f"%{identifier}%",))
distributor = cursor.fetchone()
if distributor:
return dict(zip([desc[0] for desc in cursor.description], distributor))
else:
            logging.warning(f"maker not found: {identifier}")
return None
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Query maker hrefs by filter
def query_maker_hrefs(**filters):
try:
sql = "SELECT href FROM javdb_makers WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Insert or update a series
def insert_or_update_series(data):
try:
cursor.execute("""
INSERT INTO javdb_series (name, href, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
        # fetch the series id
cursor.execute("SELECT id FROM javdb_series WHERE href = ?", (data["href"],))
stu_id = cursor.fetchone()[0]
if stu_id:
            logging.debug(f"inserted/updated series: {data['name']}")
return stu_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
        logging.error(f"database error: {e}")
return None
# Delete a series (by id or name)
def delete_series(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_series WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_series WHERE name = ?", (identifier,))
conn.commit()
        logging.info(f"deleted series: {identifier}")
except sqlite3.Error as e:
conn.rollback()
        logging.error(f"delete failed: {e}")
# Query a series (by id or name)
def query_series(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_series WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_series WHERE name LIKE ?", (f"%{identifier}%",))
studio = cursor.fetchone()
if studio:
return dict(zip([desc[0] for desc in cursor.description], studio))
else:
            logging.warning(f"series not found: {identifier}")
return None
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Query series hrefs by filter
def query_series_hrefs(**filters):
try:
sql = "SELECT href FROM javdb_series WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Insert or update a movie with full details
def insert_or_update_movie(movie):
try:
        # resolve related ids
makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
series_id = get_id_by_href('javdb_series', movie['series_link'])
cursor.execute("""
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
maker_id, series_id, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title,
cover_url=excluded.cover_url,
serial_number=excluded.serial_number,
release_date=excluded.release_date,
duration=excluded.duration,
maker_id=excluded.maker_id,
series_id=excluded.series_id,
is_full_data=1,
updated_at=datetime('now', 'localtime')
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
movie['release_date'], movie['duration'], makers_id, series_id))
conn.commit()
        # fetch the inserted movie_id
movie_id = get_id_by_href('javdb_movies', movie['href'])
if movie_id is None:
return None
        logging.debug(f"insert one movie, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")
        # insert into the actors_movies relation table
for performer in movie.get('actors', []):
performer_id = get_id_by_href('javdb_actors', performer['href'])
            # actor not present yet; insert its index first
if performer_id is None:
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
if performer_id:
tmp_id = insert_actor_movie(performer_id, movie_id)
                if tmp_id:
                    logging.debug(f"insert one performer_movie. performer_id: {performer_id}, movie_id: {movie_id}")
                else:
                    logging.debug(f"insert performer_movie failed. performer_id: {performer_id}, movie_id: {movie_id}")
            else:
                logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Delete a movie
def delete_movie(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_movies WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_movies WHERE href = ?", (identifier,))
else:
            logging.warning("invalid delete argument")
return
conn.commit()
logging.info(f"Deleted movie with {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error("Error deleting movie: %s", e)
# Query a movie
def query_movies(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_movies WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM javdb_movies WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_movies WHERE title LIKE ?", (f"%{identifier}%",))
movie = cursor.fetchone()
        if movie:
            # build the result dict from the movie row, then attach the related actor ids
            result = dict(zip([desc[0] for desc in cursor.description], movie))
            cursor.execute("SELECT actor_id FROM javdb_actors_movies WHERE movie_id = ?", (movie[0],))
            result["performers"] = [row[0] for row in cursor.fetchall()]
            return result
else:
logging.warning(f"find no data: {identifier}")
return None
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return None
# Query movie hrefs by filter
def query_movie_hrefs(**filters):
try:
sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "title" in filters:
sql += " AND title LIKE ?"
params.append(f"%{filters['title']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
        # return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
        logging.error(f"query failed: {e}")
return []
# Insert a task-log row
def insert_task_log():
try:
cursor.execute("""
INSERT INTO javdb_task_log (task_status) VALUES ('Start')
""")
conn.commit()
task_id = cursor.lastrowid
if task_id is None:
return None
update_task_log(task_id=task_id, task_status='Start')
        return task_id  # the id of the inserted row
except sqlite3.Error as e:
        logging.error(f"insert task failed: {e}")
return None
# Low-level update of task-log fields
def update_task_log_inner(task_id, **kwargs):
try:
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
params = list(kwargs.values()) + [task_id]
sql = f"UPDATE javdb_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
cursor.execute(sql, params)
conn.commit()
except sqlite3.Error as e:
        logging.error(f"update task {task_id} failed: {e}")
# Update task-log statistics and status
def update_task_log(task_id, task_status):
try:
        # current row counts for actors, movies, makers, and series
cursor.execute("SELECT COUNT(*) FROM javdb_actors where is_full_data=1")
full_data_actors = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_actors")
total_actors = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_movies where is_full_data=1")
full_data_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_movies")
total_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_makers")
total_makers = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_series")
total_series = cursor.fetchone()[0]
        # update the task_log row
update_task_log_inner(task_id,
full_data_actors=full_data_actors,
total_actors=total_actors,
full_data_movies=full_data_movies,
total_movies=total_movies,
total_makers=total_makers,
total_series=total_series,
task_status=task_status)
except sqlite3.Error as e:
        logging.error(f"update task {task_id} failed: {e}")
# Mark a task as finished
def finalize_task_log(task_id):
try:
        # final status update
update_task_log(task_id, task_status="Success")
except sqlite3.Error as e:
        logging.error(f"finalizing task {task_id} failed: {e}")
# test code
if __name__ == "__main__":
sample_data = [
{
'name': '上原亜衣',
'href': 'https://www.javdb.com/actors/MkAX',
'pic': 'https://c0.jdbstatic.com/avatars/mk/MkAX.jpg',
'alias': ['上原亜衣', '下原舞', '早瀬クリスタル', '阿蘇山百式屏風奉行']
},
{
'name': '大橋未久',
'href': 'https://www.javdb.com/actors/21Jp',
'pic': 'https://c0.jdbstatic.com/avatars/21/21Jp.jpg',
'alias': ['大橋未久']
},
]
for actor in sample_data:
insert_or_update_actor(actor)
    print(query_actors(name='未久'))
#delete_actor_by_href('https://www.javdb.com/actors/MkAX')
print(query_actors())

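The DDL for shared.db is not part of this commit. As a sketch, two of the tables inferred from the statements above (column lists and types are assumptions; the ON CONFLICT clauses do require the unique constraints shown):

import sqlite3

conn = sqlite3.connect("shared.db")  # path assumption; see DB_PATH above
conn.executescript("""
CREATE TABLE IF NOT EXISTS javdb_actors (
    id              INTEGER PRIMARY KEY AUTOINCREMENT,
    href            TEXT UNIQUE,   -- ON CONFLICT(href) needs this unique constraint
    name            TEXT,
    pic             TEXT,
    from_actor_list INTEGER DEFAULT 0,
    from_movie_list INTEGER DEFAULT 0,
    is_full_data    INTEGER DEFAULT 0,
    updated_at      TEXT
);
CREATE TABLE IF NOT EXISTS javdb_actors_movies (
    actor_id  INTEGER,
    movie_id  INTEGER,
    tags      TEXT,
    UNIQUE(actor_id, movie_id)     -- ON CONFLICT(actor_id, movie_id) needs this
);
""")
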
18
javdb/src/utils.py Normal file

@@ -0,0 +1,18 @@
import re
import os
import json
import time
import csv
from urllib.parse import urlparse
import logging
# Strip query parameters, e.g. https://www.javdb.com/makers/16w?f=download -> https://www.javdb.com/makers/16w
def remove_url_query(url: str) -> str:
try:
parsed_url = urlparse(url)
clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
return clean_url
    except Exception as e:
        logging.error(f"failed to parse URL: {e}")
return url
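
Example behaviour:

>>> remove_url_query("https://www.javdb.com/makers/16w?f=download")
'https://www.javdb.com/makers/16w'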