modify scripts
javdb/src/config.py (87 lines, Normal file)
@@ -0,0 +1,87 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'


# Logging-frequency bookkeeping
log_count = defaultdict(int)        # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message


class RateLimitFilter(logging.Filter):
    """
    Rate-limiting filter:
    1. Within a 60-second window, the same message is written at most LOG_LIMIT times; the rest are dropped.
    2. (Intended, not implemented here) warn when the rate exceeds 100 messages/second.
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the rendered log message

        # Current time
        now = time.time()
        elapsed = now - last_log_time[message_key]

        # Throttle identical messages
        if elapsed < 60:  # still inside the 60-second window
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # window expired, restart the count

        last_log_time[message_key] = now

        return True  # let the record through


def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    max_log_size = 100 * 1024 * 1024  # 100 MB
    max_log_files = 10  # keep at most 10 rotated log files

    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    # Create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding duplicate handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)


# Example run
if __name__ == "__main__":
    setup_logging()

    for i in range(1000):
        logging.info("test message to exercise the rate limit")
        time.sleep(0.01)  # simulate rapid logging
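A minimal usage sketch for config.py (assuming the importing script sits next to it and a ../log directory exists, as the default path implies):

    import logging
    import config

    config.setup_logging()    # logs to ../log/<caller>_<YYYYMMDD>.log and to the console
    logging.info("started")   # repeated identical messages are throttled by RateLimitFilter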
javdb/src/fetch.py (294 lines, Normal file)
@@ -0,0 +1,294 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

debug = False
force = False

# Fetch the actor list
def fetch_actor_list():
    next_url = scraper.actors_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f"insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['name']}, href:{row['href']}")
            else:
                logging.warning(f'fetch actor error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Fetch the makers list
def fetch_makers_list():
    next_url = scraper.makers_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row)
                    if maker_id:
                        logging.debug(f"insert maker to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert maker failed. name: {row['name']}, href:{row['href']}")
            else:
                logging.warning(f'fetch makers error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Fetch the series list
def fetch_series_list():
    next_url = scraper.series_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row)
                    if series_id:
                        logging.debug(f"insert series to db. series_id:{series_id}, name: {row['name']}, href:{row['href']}")
                    else:
                        logging.warning(f"insert series failed. name: {row['name']}, href:{row['href']}")
            else:
                logging.warning(f'fetch series error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
# Update movie info for every maker in the list
def fetch_movies_by_maker():
    url_list = db_tools.query_maker_hrefs()
    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    for url in url_list:
        # Strip the downloadable flag from the URL (if present)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_maker_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break

        # Early return for debugging
        if debug:
            return True

# Update movie info for every series in the list
def fetch_movies_by_series():
    url_list = db_tools.query_series_hrefs()
    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    for url in url_list:
        # Strip the downloadable flag from the URL (if present)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_series_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break

        # Early return for debugging
        if debug:
            return True


# Update actor details
def fetch_performers_detail():
    performers_list = []
    while True:
        # Take a batch from the database each round instead of loading everything at once
        performers_list = db_tools.query_actors(is_full_data=0, limit=100)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
            alias = []

            next_url = url
            all_movies = []
            while next_url:
                logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        pic = data.get('pic', '')
                        alias = data.get('alias', [])
                        all_movies.extend(data.get('movies', []))
                elif status_code and status_code == 404:
                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
                    break
                else:
                    logging.warning(f'fetch_page error. person: ({person}), url: {url}')

            # All pages for this person fetched; insert the data
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': pic,
                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
                logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
        # Early return for debugging
        if debug:
            return True

# Update movie details
def fetch_movies_detail():
    movies_list = []
    while True:
        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            logging.info(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_movie_detail error. url: {url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}')
                break
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(1)  # throttle between requests
        # Early return for debugging
        if debug:
            return True
# Map shortcut names to functions
function_map = {
    "actor_list": fetch_actor_list,
    "maker_list": fetch_makers_list,
    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "movies": fetch_movies_detail,
    "actors": fetch_performers_detail,
}

# Main entry point
def main(cmd, args_debug, args_force):
    global debug
    debug = args_debug

    global force
    force = args_force

    # Open a task record
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning('insert task log error.')
        return None

    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')

    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the mapped function
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
                func()
            else:
                logging.warning(f" {short_name} is not a valid function shortcut.")
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {name}')
                func()
            else:
                logging.warning(f" {name} is not a valid function shortcut.")

    logging.info('all processes completed!')
    db_tools.finalize_task_log(task_id)

# TODO:
# 1,

if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    args = parser.parse_args()

    main(args.cmd, args.debug, args.force)
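Example invocations, read straight off the argparse definition above:

    python fetch.py                          # run every task in function_map, in order
    python fetch.py --cmd maker_list,makers  # fetch the maker list, then each maker's movies
    python fetch.py --cmd movies --debug     # movie details only, with debug limits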
javdb/src/scraper.py (504 lines, Normal file)
@@ -0,0 +1,504 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URLs and variable parameters
host_url = "https://www.javdb.com"
actors_uncensored_base_url = f'{host_url}/actors/uncensored'
series_uncensored_base_url = f'{host_url}/series/uncensored'
makers_uncensored_base_url = f'{host_url}/makers/uncensored'

# Headers and scraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and run a page check; supports different parsers and preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if 'javdb.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            response = scraper.get(url, headers=headers)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # return 404 directly so the caller can skip

            response.raise_for_status()  # raise on other HTTP errors

            # Preprocess the HTML (if a preprocessor was supplied)
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries

# Repair the HTML structure: drop stray tags and patch <a> tags (needed when scraping ethnicity info)
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

# Extract the page number from a link
def url_page_num(href):
    if href is None:
        return None
    match = re.search(r'page=(\d+)', href)
    if match:
        next_page_number = int(match.group(1))
        return next_page_number
    else:
        return None


# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
def parse_avatar_image(soup):
    try:
        span = soup.find("span", class_="avatar")
        if not span:
            return ""  # no <span> element found, return an empty string

        style = span.get("style", "")
        match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
        return match.group(1) if match else ""  # the URL on success, else an empty string
    except Exception as e:
        return ""  # return an empty string on any exception


# Parse the HTML and extract the data we need
def parse_actors_uncensored(soup, href):
    div_actors = soup.find("div", id='actors')
    if not div_actors:
        logging.warning("Warning: No actors div found")
        return None, None

    # Parse the elements
    rows = div_actors.find_all('div', class_='box actor-box')

    list_data = []
    next_url = None
    for row in rows:
        # Actor detail link
        actor_link = row.find('a')['href']
        # Actor name
        actor_name = row.find('strong').text.strip()
        # Avatar image URL
        avatar_url = row.find('img', class_='avatar')['src']
        # Aliases from the title attribute
        alias_list = row.find('a')['title'].split(", ")

        list_data.append({
            'name': actor_name,
            'href': host_url + actor_link if actor_link else '',
            'pic': avatar_url,
            'alias': alias_list
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_actor_detail(soup, href):
    # Aliases first
    alias_list = []

    div_meta = soup.find('span', class_='actor-section-name')
    if not div_meta:
        logging.warning(f'warning: no meta data found in page {href}')
        return None, None
    alias_div = soup.find('div', class_='column section-title')

    if alias_div:
        meta_list = alias_div.find_all('span', class_='section-meta')
        if len(meta_list) > 1:
            alias_list = meta_list[0].text.strip().split(", ")

    # Avatar
    pic = ''
    avatar = soup.find("div", class_="column actor-avatar")
    if avatar:
        pic = parse_avatar_image(avatar)

    # Data to return
    actor = {}

    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return None, None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_number,
            'title': title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    actor = {
        'pic': pic,
        'alias': alias_list,
        'movies': list_data
    }

    return actor, next_url
# Parse the HTML and extract the data we need
def parse_movie_detail(soup, href, title):
    div_video = soup.find("div", class_='video-meta-panel')
    if not div_video:
        logging.warning("Warning: No movies div found")
        return None  # single value; callers treat the result as one object

    # Cover image
    cover_img = soup.select_one('.column-video-cover a')
    cover_url = cover_img['href'] if cover_img else None

    # Serial number
    serial = soup.select_one('.panel-block:first-child .value')
    serial_number = serial.text.strip() if serial else None

    # Release date
    date = soup.select_one('.panel-block:nth-of-type(2) .value')
    release_date = date.text.strip() if date else None

    # Duration
    duration = soup.select_one('.panel-block:nth-of-type(3) .value')
    video_duration = duration.text.strip() if duration else None

    # Maker
    maker = soup.select_one('.panel-block:nth-of-type(4) .value a')
    maker_name = maker.text.strip() if maker else None
    maker_link = maker['href'] if maker else None

    # Series
    series = soup.select_one('.panel-block:nth-of-type(5) .value a')
    series_name = series.text.strip() if series else None
    series_link = series['href'] if series else None

    # Actors (name + link)
    actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']} for actor in soup.select('.panel-block:nth-of-type(8) .value a')]

    return {
        'href': href,
        'title': title,
        'cover_url': cover_url,
        'serial_number': serial_number,
        'release_date': release_date,
        'duration': video_duration,
        'maker_name': maker_name,
        'maker_link': host_url + maker_link if maker_link else '',
        'series_name': series_name,
        'series_link': host_url + series_link if series_link else '',
        'actors': actors
    }

# Parse the HTML and extract the data we need
def parse_series_uncensored(soup, href):
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning("Warning: No series div found")
        return None, None

    # Parse the elements
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        item_href = row['href']  # local name so the page-URL parameter is not shadowed
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name': name,
            'href': host_url + item_href if item_href else '',
            'movies': movies
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_series_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return [], None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_number,
            'title': title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_makers_uncensored(soup, href):
    div_makers = soup.find("div", id='makers')
    if not div_makers:
        logging.warning("Warning: No makers div found")
        return None, None

    # Parse the elements
    rows = div_makers.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        item_href = row['href']  # local name so the page-URL parameter is not shadowed
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name': name,
            'href': host_url + item_href if item_href else '',
            'movies': movies
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_maker_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return [], None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_number,
            'title': title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
###### Test code below ######
def test_actors_list():
    next_url = actors_uncensored_base_url
    while next_url:
        print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = parse_actors_uncensored(soup, next_url)
            if list_data:
                print(list_data)
        else:
            print('get wrong page.')
            if next_url:
                print(next_url)
            break

def test_actor():
    next_url = 'https://javdb.com/actors/mdRn'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            data, next_url = parse_actor_detail(soup, next_url)
            if data:
                all_data.append(data)
        else:
            print('get wrong page.')
    print(all_data)

def test_movie_detail():
    movie_url = 'https://javdb.com/v/gB2Q7'
    while True:
        soup, status_code = fetch_page(movie_url, partial(generic_validator, tag="div", identifier="video-detail", attr_type="class"))
        if soup:
            detail = parse_movie_detail(soup, movie_url, 'RED193 無碼 レッドホットフェティッシュコレクション 中出し120連発 4 : 波多野結衣, 愛乃なみ, 夢実あくび, 他多数')
            if detail:
                print(detail)
        break


def test_series_list():
    next_url = 'https://javdb.com/series/uncensored'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = parse_series_uncensored(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break

    print(all_data)

def test_series_detail():
    next_url = 'https://javdb.com/series/39za'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            list_data, next_url = parse_series_detail(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
    print(all_data)


if __name__ == "__main__":
    #test_actors_list()
    #test_actor()
    test_movie_detail()
    #test_series_list()
    #test_series_detail()
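fetch_page's preprocessor hook pairs with preprocess_html, which none of the callers above use yet. A hedged sketch (the URL and validator arguments are illustrative):

    soup, status = fetch_page(
        'https://www.javdb.com/actors/mdRn',
        partial(generic_validator, tag="div", identifier="actors", attr_type="id"),
        preprocessor=preprocess_html,  # strip <br> and force target="_blank" before parsing
    )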
javdb/src/sqlite_utils.py (599 lines, Normal file)
@@ -0,0 +1,599 @@
import sqlite3
import json
import config
import logging
from datetime import datetime

# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # replace with your database file
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
cursor = conn.cursor()

# Look up an id by href in the given table
def get_id_by_href(table: str, href: str) -> int:
    if href is None:
        return None
    cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
    row = cursor.fetchone()
    return row[0] if row else None


def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
    try:
        # Check whether this actor already exists
        cursor.execute("SELECT id, name, from_actor_list, from_movie_list FROM javdb_actors WHERE href = ?", (href,))
        existing_actor = cursor.fetchone()

        if existing_actor:  # the actor already exists
            actor_id, existing_name, existing_actor_list, existing_movie_list = existing_actor

            # Keep the stored values when no new ones were passed in
            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor_list
            from_movie_list = from_movie_list if from_movie_list is not None else existing_movie_list

            cursor.execute("""
                UPDATE javdb_actors
                SET name = ?,
                    from_actor_list = ?,
                    from_movie_list = ?,
                    updated_at = datetime('now', 'localtime')
                WHERE href = ?
            """, (name, from_actor_list, from_movie_list, href))
        else:  # the actor does not exist yet, insert it
            cursor.execute("""
                INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
            """, (href, name, from_actor_list, from_movie_list))

        conn.commit()

        performer_id = get_id_by_href('javdb_actors', href)
        if performer_id:
            logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')

        return performer_id

    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"Unknown error: {e}")
        return None

def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None):
    try:
        # Check whether this movie is already in the database
        cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series FROM javdb_movies WHERE href = ?", (href,))
        existing_movie = cursor.fetchone()

        if existing_movie:  # the movie already exists
            movie_id, existing_actor, existing_maker, existing_series = existing_movie

            # Keep the stored values when no new ones were passed in
            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
            from_movie_makers = from_movie_makers if from_movie_makers is not None else existing_maker
            from_movie_series = from_movie_series if from_movie_series is not None else existing_series

            cursor.execute("""
                UPDATE javdb_movies
                SET title = ?,
                    from_actor_list = ?,
                    from_movie_makers = ?,
                    from_movie_series = ?,
                    updated_at = datetime('now', 'localtime')
                WHERE href = ?
            """, (title, from_actor_list, from_movie_makers, from_movie_series, href))
        else:  # the movie does not exist yet, insert it
            cursor.execute("""
                INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series)
                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
            """, (title, href, from_actor_list, from_movie_makers, from_movie_series))

        conn.commit()

        movie_id = get_id_by_href('javdb_movies', href)
        if movie_id:
            logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')

        return movie_id

    except Exception as e:
        conn.rollback()
        logging.error(f"Error inserting/updating movie: {e}")
        return None


# Insert an actor-movie relation
def insert_actor_movie(performer_id, movie_id, tags=''):
    try:
        cursor.execute("""
            INSERT INTO javdb_actors_movies (actor_id, movie_id, tags)
            VALUES (?, ?, ?)
            ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags
            """,
            (performer_id, movie_id, tags)
        )
        conn.commit()

        #logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')

        return performer_id

    except Exception as e:
        conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None

# Insert actor data
def insert_or_update_actor(actor):
    try:
        cursor.execute('''
            INSERT INTO javdb_actors (name, href, pic, is_full_data, updated_at)
            VALUES (?, ?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime')
        ''', (actor['name'], actor['href'], actor['pic']))

        conn.commit()

        actor_id = get_id_by_href('javdb_actors', actor['href'])
        if actor_id is None:
            logging.warning(f"insert data error. name: {actor['name']}, href: {actor['href']}")
            return None

        logging.debug(f"insert one actor, id: {actor_id}, name: {actor['name']}, href: {actor['href']}")

        # Insert aliases
        for alias in actor.get("alias") or []:
            cursor.execute('''
                INSERT OR IGNORE INTO javdb_actors_alias (actor_id, alias, updated_at)
                VALUES (?, ?, datetime('now', 'localtime'))
            ''', (actor_id, alias))

        conn.commit()

        # Insert the movie list
        for movie in actor.get("credits") or []:
            movie_id = get_id_by_href('javdb_movies', movie['href'])
            # The movie is not there yet, insert it first
            if movie_id is None:
                movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
            if movie_id:
                tmp_id = insert_actor_movie(actor_id, movie_id)
                if tmp_id:
                    logging.debug(f'insert one performer_movie, performer_id: {actor_id}, movie_id: {movie_id}')
                else:
                    logging.warning(f"insert performer_movie failed. performer_id: {actor_id}, movie href: {movie['href']}")

        return actor_id
    except Exception as e:
        logging.error(f"Failed to insert/update actor {actor['name']}: {e}")
        conn.rollback()

# Delete an actor
def delete_actor_by_href(href):
    try:
        cursor.execute('DELETE FROM javdb_actors WHERE href = ?', (href,))
        conn.commit()
        logging.info(f"Deleted actor: {href}")
    except Exception as e:
        logging.error(f"Failed to delete actor {href}: {e}")
        conn.rollback()

# Query actors
def query_actors(**filters):
    try:
        sql = "SELECT href, name FROM javdb_actors WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")
        if "is_full_data" in filters:
            sql += " AND is_full_data = ?"
            params.append(filters["is_full_data"])
        if 'limit' in filters:
            sql += " limit ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        #return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
        return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return None
# Insert or update a maker
def insert_or_update_makers(data):
    try:
        cursor.execute("""
            INSERT INTO javdb_makers (name, href, updated_at)
            VALUES (?, ?, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                updated_at = datetime('now', 'localtime')
        """, (data["name"], data["href"]))
        conn.commit()

        # Get the maker id
        cursor.execute("SELECT id FROM javdb_makers WHERE href = ?", (data["href"],))
        row = cursor.fetchone()
        dist_id = row[0] if row else None
        if dist_id:
            logging.debug(f"Inserted/updated maker: {data['name']}")
            return dist_id
        else:
            return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None

# Delete a maker (by id or name)
def delete_maker(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM javdb_makers WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM javdb_makers WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted maker: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")

# Query a maker (by id or name)
def query_maker(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_makers WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_makers WHERE name LIKE ?", (f"%{identifier}%",))

        distributor = cursor.fetchone()
        if distributor:
            return dict(zip([desc[0] for desc in cursor.description], distributor))
        else:
            logging.warning(f"Maker not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

# Query maker hrefs by filter
def query_maker_hrefs(**filters):
    try:
        sql = "SELECT href FROM javdb_makers WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return None

# Insert or update a series
def insert_or_update_series(data):
    try:
        cursor.execute("""
            INSERT INTO javdb_series (name, href, updated_at)
            VALUES (?, ?, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                updated_at = datetime('now', 'localtime')
        """, (data["name"], data["href"]))
        conn.commit()

        # Get the series id
        cursor.execute("SELECT id FROM javdb_series WHERE href = ?", (data["href"],))
        row = cursor.fetchone()
        stu_id = row[0] if row else None
        if stu_id:
            logging.debug(f"Inserted/updated series: {data['name']}")
            return stu_id
        else:
            return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None

# Delete a series (by id or name)
def delete_series(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM javdb_series WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM javdb_series WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted series: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")

# Query a series (by id or name)
def query_series(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_series WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_series WHERE name LIKE ?", (f"%{identifier}%",))

        studio = cursor.fetchone()
        if studio:
            return dict(zip([desc[0] for desc in cursor.description], studio))
        else:
            logging.warning(f"Series not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

# Query series hrefs by filter
def query_series_hrefs(**filters):
    try:
        sql = "SELECT href FROM javdb_series WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return None


# Insert or update movie data
def insert_or_update_movie(movie):
    try:
        # Resolve related ids
        makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
        series_id = get_id_by_href('javdb_series', movie['series_link'])

        cursor.execute("""
            INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
                                      maker_id, series_id, is_full_data, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                title=excluded.title,
                cover_url=excluded.cover_url,
                serial_number=excluded.serial_number,
                release_date=excluded.release_date,
                duration=excluded.duration,
                maker_id=excluded.maker_id,
                series_id=excluded.series_id,
                is_full_data=1,
                updated_at=datetime('now', 'localtime')
        """, (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
              movie['release_date'], movie['duration'], makers_id, series_id))

        conn.commit()

        # Get the inserted movie_id
        movie_id = get_id_by_href('javdb_movies', movie['href'])
        if movie_id is None:
            return None

        logging.debug(f"insert one movie, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")

        # Fill the performers_movies relation table
        for performer in movie.get('actors', []):
            performer_id = get_id_by_href('javdb_actors', performer['href'])
            # The actor is not there yet, insert it first
            if performer_id is None:
                performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
            if performer_id:
                tmp_id = insert_actor_movie(performer_id, movie_id)
                if tmp_id:
                    logging.debug(f"insert one performer_movie. performer_id: {performer_id}, movie_id:{movie_id}")
                else:
                    logging.debug(f'insert performer_movie failed. performer_id: {performer_id}, movie_id:{movie_id}')
            else:
                logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")

        return movie_id

    except Exception as e:
        conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None

# Delete movie data
def delete_movie(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM javdb_movies WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM javdb_movies WHERE href = ?", (identifier,))
        else:
            logging.warning("invalid delete argument")
            return
        conn.commit()
        logging.info(f"Deleted movie with {identifier}")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error("Error deleting movie: %s", e)

# Look up movie data
def query_movies(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_movies WHERE id = ?", (identifier,))
        elif "http" in identifier:
            cursor.execute("SELECT * FROM javdb_movies WHERE href = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_movies WHERE title LIKE ?", (f"%{identifier}%",))

        movie = cursor.fetchone()
        if movie:
            # Build the result dict before reusing the cursor
            result = dict(zip([desc[0] for desc in cursor.description], movie))
            cursor.execute("SELECT actor_id FROM javdb_actors_movies WHERE movie_id = ?", (movie[0],))
            performers = [row[0] for row in cursor.fetchall()]
            result["performers"] = performers
            return result
        else:
            logging.warning(f"find no data: {identifier}")
            return None

    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

# Query movie hrefs by filter
def query_movie_hrefs(**filters):
    try:
        sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "title" in filters:
            sql += " AND title LIKE ?"
            params.append(f"%{filters['title']}%")
        if "is_full_data" in filters:
            sql += " AND is_full_data = ?"
            params.append(filters["is_full_data"])
        if 'limit' in filters:
            sql += " limit ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        #return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return []
# Insert a task-log row
def insert_task_log():
    try:
        cursor.execute("""
            INSERT INTO javdb_task_log (task_status) VALUES ('Start')
        """)
        conn.commit()

        task_id = cursor.lastrowid
        if task_id is None:
            return None
        update_task_log(task_id=task_id, task_status='Start')

        return task_id  # id of the inserted task
    except sqlite3.Error as e:
        logging.error(f"Failed to insert task: {e}")
        return None

# Update task-log fields (generic)
def update_task_log_inner(task_id, **kwargs):
    try:
        fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
        params = list(kwargs.values()) + [task_id]

        sql = f"UPDATE javdb_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
        cursor.execute(sql, params)
        conn.commit()
    except sqlite3.Error as e:
        logging.error(f"Failed to update task {task_id}: {e}")

# Update task-log fields plus the current table counts
def update_task_log(task_id, task_status):
    try:
        # Current row counts for the actors, movies, makers and series tables
        cursor.execute("SELECT COUNT(*) FROM javdb_actors where is_full_data=1")
        full_data_actors = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM javdb_actors")
        total_actors = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_movies where is_full_data=1")
        full_data_movies = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM javdb_movies")
        total_movies = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_makers")
        total_makers = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_series")
        total_series = cursor.fetchone()[0]

        # Update task_log
        update_task_log_inner(task_id,
                              full_data_actors=full_data_actors,
                              total_actors=total_actors,
                              full_data_movies=full_data_movies,
                              total_movies=total_movies,
                              total_makers=total_makers,
                              total_series=total_series,
                              task_status=task_status)

    except sqlite3.Error as e:
        logging.error(f"Failed to update task {task_id}: {e}")


# Mark a task as finished and refresh its fields
def finalize_task_log(task_id):
    try:
        # Update task_log
        update_task_log(task_id, task_status="Success")
    except sqlite3.Error as e:
        logging.error(f"Failed to finalize task {task_id}: {e}")


# Test code
if __name__ == "__main__":

    sample_data = [
        {
            'name': '上原亜衣',
            'href': 'https://www.javdb.com/actors/MkAX',
            'pic': 'https://c0.jdbstatic.com/avatars/mk/MkAX.jpg',
            'alias': ['上原亜衣', '下原舞', '早瀬クリスタル', '阿蘇山百式屏風奉行']
        },
        {
            'name': '大橋未久',
            'href': 'https://www.javdb.com/actors/21Jp',
            'pic': 'https://c0.jdbstatic.com/avatars/21/21Jp.jpg',
            'alias': ['大橋未久']
        },
    ]

    for actor in sample_data:
        insert_or_update_actor(actor)

    print(query_actors(name='未久'))  # keyword filter, matches name LIKE '%未久%'
    #delete_actor_by_href('https://www.javdb.com/actors/MkAX')
    print(query_actors())
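A sketch of the batch pattern fetch.py drives with these helpers (keyword filters only; is_full_data flips to 1 once a record is completed by insert_or_update_movie):

    batch = query_movie_hrefs(is_full_data=0, limit=100)  # next 100 incomplete movies
    for movie in batch:
        print(movie['href'], movie['title'])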
javdb/src/utils.py (18 lines, Normal file)
@@ -0,0 +1,18 @@
import re
import os
import json
import time
import csv
from urllib.parse import urlparse
import logging


# Strip the query string, e.g. https://www.javdb.com/makers/16w?f=download -> .../makers/16w
def remove_url_query(url: str) -> str:
    try:
        parsed_url = urlparse(url)
        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        return clean_url
    except Exception as e:
        print(f"Failed to parse URL: {e}")
        return url
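For reference, the behavior described in the comment above:

    >>> remove_url_query('https://www.javdb.com/makers/16w?f=download')
    'https://www.javdb.com/makers/16w'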