modify scripts
This commit is contained in:
167
src/utils/utils.py
Normal file
167
src/utils/utils.py
Normal file
@ -0,0 +1,167 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
import logging
|
||||
import src.config.config as config
|
||||
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
|
||||
|
||||
# Root directory for all javdb artifacts (raw HTML cache, CSV exports).
update_dir = f'{config.global_host_data_dir}/javdb'
|
||||
|
||||
def is_valid_url(url: str) -> bool:
    """Return True if *url* parses with both a scheme (http/https/...) and a netloc."""
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on some malformed inputs (e.g. bad ports)
        return False
    return bool(parts.scheme) and bool(parts.netloc)
|
||||
|
||||
def create_sub_directory(base_dir, name):
    """Return (creating it if needed) a one-letter bucket directory under *base_dir*.

    The bucket is named after the first character of *name*, lower-cased,
    e.g. create_sub_directory("/data", "Abc") -> "/data/a".

    Note: parameter renamed from ``str`` (which shadowed the builtin); both
    call sites in this file pass it positionally.
    """
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||||
|
||||
# Matches the movie id segment of a javdb movie URL; compiled once at import.
# Dots are escaped — the original r'javdb.com/v/...' let '.' match any char.
_MOVIE_ID_RE = re.compile(r'javdb\.com/v/([^?&]+)')


def extract_id_from_href(href):
    """Extract the lower-cased movie id from a javdb movie URL.

    Only hrefs containing ``javdb.com/v/`` qualify; anything else (or an
    URL with no id segment) yields the empty string.
    """
    match = _MOVIE_ID_RE.search(href)
    if match:
        return match.group(1).lower()
    return ''
|
||||
|
||||
# Persist raw fetched HTML so it can be re-verified later without re-fetching.
def write_raw_html(href, html_text):
    """Save *html_text* under {update_dir}/raw_movies/<first-letter>/<id>.html.

    Non-movie URLs are silently ignored. URLs from which no id can be
    extracted are skipped with a warning (previously they all collided on a
    file literally named ".html").
    I/O failures are logged, never raised.
    """
    if 'javdb.com/v/' not in href.lower():
        # Only javdb movie pages are cached.
        return
    movie_id = extract_id_from_href(href)
    if not movie_id:
        logging.warning(f"no id extracted from href {href}, skip saving")
        return

    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
|
||||
|
||||
|
||||
# Read back a previously cached raw HTML page (counterpart of write_raw_html).
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for a javdb movie page, or None.

    A cached file is used only when its mtime is later than
    *expire_date_str* (``YYYY-MM-DD``); older files count as expired.
    Returns None for non-movie URLs, missing ids, missing/expired files,
    and on any I/O error (which is logged, never raised).
    """
    if 'javdb.com/v/' not in href.lower():
        return None
    movie_id = extract_id_from_href(href)
    if not movie_id:
        # An empty id would point at the collision file ".html"; treat as a miss.
        return None

    # NOTE(review): create_sub_directory also *creates* the bucket dir on a
    # pure read — kept so the path logic stays identical with write_raw_html.
    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        if not os.path.exists(full_path):
            return None
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
|
||||
|
||||
|
||||
|
||||
# Strip query parameters, e.g.
# https://www.javdb.com/makers/16w?f=download -> https://www.javdb.com/makers/16w
def remove_url_query(url: str) -> str:
    """Return *url* with its query string (and fragment) removed.

    On parse failure the original url is returned unchanged; the error is
    logged (the rest of this module uses logging — the original used print).
    """
    try:
        parsed_url = urlparse(url)
    except Exception as e:
        logging.warning(f"解析 URL 失败: {e}")
        return url
    return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
|
||||
# Dump a list of JSON-like dicts as a CSV file under update_dir.
def json_to_csv(data, output_file):
    """Write *data* (list of dicts) to {update_dir}/{output_file} as CSV.

    The header is the union of all rows' keys in first-seen order, so rows
    carrying keys absent from the first row no longer raise ValueError in
    DictWriter. Missing values are written as empty cells (DictWriter's
    default restval). An empty *data* list writes nothing.
    """
    if not data:
        return
    # dict preserves insertion order — used here as an ordered set of keys.
    headers = {}
    for row in data:
        for key in row:
            headers[key] = None
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(headers))
        writer.writeheader()
        writer.writerows(data)
|
||||
|
||||
|
||||
|
||||
# Language-code path prefixes to strip; module-level so the set is built once,
# not on every call.
_LANG_PREFIXES = frozenset({'ja', 'en', 'ko', 'zh', 'fr', 'de', 'es', 'ru'})


def normalize_url(url: str) -> str:
    """Normalize *url* by dropping a leading language-code path segment.

    Makes different language versions of the same page compare equal:
        https://www.javbus.com/ja/star/p8y -> https://www.javbus.com/star/p8y
        https://www.javbus.com/en/star/p8y -> https://www.javbus.com/star/p8y

    On any error the original url is returned unchanged; the error is
    logged (the rest of this module uses logging — the original used print).
    """
    try:
        parsed = urlparse(url)
        # Split path into segments; drop the first one if it is a language code.
        components = parsed.path.strip('/').split('/')
        if components and components[0] in _LANG_PREFIXES:
            components = components[1:]
        normalized_path = '/' + '/'.join(components)
        # Keep scheme/netloc/query/fragment, swap in the cleaned path.
        return parsed._replace(path=normalized_path).geturl()
    except Exception as e:
        logging.warning(f"URL标准化失败: {url}, 错误: {e}")
        return url
|
||||
Reference in New Issue
Block a user