modify scripts

This commit is contained in:
oscarz
2025-06-24 10:02:28 +08:00
parent 882ee5047a
commit 12c53b043d
8 changed files with 2569 additions and 0 deletions

167
src/utils/utils.py Normal file
View File

@@ -0,0 +1,167 @@
import re
import os
import json
import time
import csv
from datetime import datetime
from urllib.parse import urlparse
import logging
import src.config.config as config
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
update_dir = f'{config.global_host_data_dir}/javdb'
def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme (http/https) and a netloc (example.com)."""
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    # Both pieces must be non-empty for the URL to be usable.
    return bool(parts.scheme) and bool(parts.netloc)
# Create (if needed) and return a one-character bucket directory.
def create_sub_directory(base_dir, name):
    """Return base_dir/<first character of *name*, lowercased>, creating it if missing.

    Bucketing by the first character keeps individual directories small when
    many files are stored.

    Note: the original parameter was named ``str`` (shadowing the builtin) and
    its comment claimed "first two letters" while the code takes only one
    character; both are corrected here without changing behavior.
    """
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the TOCTOU race between an exists() check and makedirs().
    os.makedirs(full_path, exist_ok=True)
    return full_path
# Extract the id only from movie URLs.
def extract_id_from_href(href):
    """Return the lowercased video id from a javdb movie URL (``.../v/<id>``).

    Returns '' when *href* is not a movie-page URL or no id can be extracted.
    """
    if 'javdb.com/v/' in href:
        # Dots are escaped: the original pattern r'javdb.com/v/...' let '.'
        # match any character, making the regex more permissive than intended.
        match = re.search(r'javdb\.com/v/([^?&]+)', href)
        if match:
            # Ids are compared case-insensitively elsewhere, so normalize here.
            return match.group(1).lower()
    return ''
# Save the fetched raw HTML so it can be re-checked later.
def write_raw_html(href, html_text):
    """Persist raw HTML of a movie page to <update_dir>/raw_movies/<bucket>/<id>.html.

    Non-movie URLs are silently ignored. Write failures are logged rather than
    raised so one bad file cannot abort a crawl run.
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    movie_id = extract_id_from_href(href)
    if 'javdb.com/v/' not in href.lower():
        # Only movie detail pages are cached.
        return
    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    # (The old "replace spaces with -" comment here was stale; nothing is replaced.)
    full_path = os.path.join(file_dir, f"{movie_id}.html")
    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
# Read back previously saved raw HTML (the old header comment was a copy-paste
# of the writer's; this function reads, not saves).
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for a movie URL, or None.

    None is returned when *href* is not a movie page, no cached file exists,
    the file was last modified on or before *expire_date_str* (``YYYY-MM-DD``),
    or a read error occurs (errors are logged, never raised).
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    movie_id = extract_id_from_href(href)
    if 'javdb.com/v/' not in href.lower():
        return None
    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")
    try:
        if not os.path.exists(full_path):
            return None
        # Treat the cache entry as fresh only if it was written after the
        # expiry cutoff date.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
# Drop everything after the path, e.g.
# https://www.javdb.com/makers/16w?f=download -> https://www.javdb.com/makers/16w
def remove_url_query(url: str) -> str:
    """Return *url* reduced to scheme://netloc/path (params, query and fragment dropped)."""
    try:
        parts = urlparse(url)
        return ''.join((parts.scheme, '://', parts.netloc, parts.path))
    except Exception as e:
        # Fall back to the original string on any parsing failure.
        print(f"解析 URL 失败: {e}")
        return url
# Export a list of JSON rows to a CSV file.
def json_to_csv(data, output_file):
    """Write a list of dicts to ``<update_dir>/<output_file>`` as CSV.

    Column order follows the keys of the first row. Rows containing keys not
    present in the header are tolerated (extras dropped) and missing keys are
    written as '' — the original raised ``ValueError`` on the first mismatched
    row, losing the whole export. No-op for empty input.
    """
    if not data:
        return
    headers = list(data[0].keys())
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers,
                                restval='', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(data)
def normalize_url(url: str) -> str:
    """Normalize a URL by removing a leading language-code path segment, so that
    different language versions of the same page compare equal.

    Examples:
        https://www.javbus.com/ja/star/p8y -> https://www.javbus.com/star/p8y
        https://www.javbus.com/en/star/p8y -> https://www.javbus.com/star/p8y
    """
    # Language prefixes that may appear as the first path segment.
    LANGUAGES = {'ja', 'en', 'ko', 'zh', 'fr', 'de', 'es', 'ru'}
    try:
        parsed = urlparse(url)
        segments = parsed.path.strip('/').split('/')
        if segments and segments[0] in LANGUAGES:
            del segments[0]
        # Rebuild the URL with the cleaned path; scheme, host and query survive.
        return parsed._replace(path='/' + '/'.join(segments)).geturl()
    except Exception as e:
        # On any failure, hand back the input unchanged.
        print(f"URL标准化失败: {url}, 错误: {e}")
        return url