This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/javdb/src/utils.py
2025-04-23 19:18:06 +08:00

119 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import os
import json
import time
import csv
from datetime import datetime
from urllib.parse import urlparse
import logging
import config
update_dir = f'{config.global_host_data_dir}/javdb'
# 创建目录
def create_sub_directory(base_dir, str):
# 获取 person 的前两个字母并转为小写
sub_dir = str[:1].lower()
full_path = os.path.join(base_dir, sub_dir)
if not os.path.exists(full_path):
os.makedirs(full_path)
return full_path
# 只提取movies url
def extract_id_from_href(href):
# 检查 URL 是否符合要求
if 'javdb.com/v/' in href:
# 定义正则表达式模式
pattern = r'javdb.com/v/([^?&]+)'
# 查找匹配项
match = re.search(pattern, href)
if match:
# 提取匹配的字符串并转换为小写
result = match.group(1).lower()
return result
return ''
# 保存抓取到的原始HTML方便后续核验
def write_raw_html(href, html_text):
# 获取目录
id = extract_id_from_href(href)
if 'javdb.com/v/' in href.lower():
dir_prefix = 'raw_movies'
else:
return
file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
file_name = f"{id}.html" # 用 - 替换空格
full_path = os.path.join(file_dir, file_name)
try:
with open(full_path, 'w', encoding='utf-8') as file:
file.write(html_text)
except FileNotFoundError:
logging.warning(f"错误:指定的路径 {full_path} 不存在。")
except PermissionError:
logging.warning(f"错误:没有权限写入文件 {full_path}")
except Exception as e:
logging.warning(f"发生未知错误:{e}")
# 保存抓取到的原始HTML方便后续核验
def read_raw_html(href, expire_date_str="2025-03-01"):
# 获取目录
id = extract_id_from_href(href)
if 'javdb.com/v/' in href.lower():
dir_prefix = 'raw_movies'
else:
return
file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
file_name = f"{id}.html" # 用 - 替换空格
full_path = os.path.join(file_dir, file_name)
try:
if os.path.exists(full_path):
# 获取文件的最后修改时间
last_modified_timestamp = os.path.getmtime(full_path)
# 将时间戳转换为 datetime 对象
last_modified_date = datetime.fromtimestamp(last_modified_timestamp)
# 检查文件最后修改时间是否晚于给定日期
expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
if last_modified_date > expire_date:
logging.debug(f"find local file on href {href}")
with open(full_path, 'r', encoding='utf-8') as file:
return file.read()
else:
logging.debug(f"expired file {last_modified_date} on href {href}")
return None
else:
return None
except FileNotFoundError:
logging.warning(f"错误:指定的路径 {full_path} 不存在。")
except PermissionError:
logging.warning(f"错误:没有权限读取文件 {full_path}")
except Exception as e:
logging.warning(f"发生未知错误:{e}")
return None
# 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数
def remove_url_query(url: str) -> str:
try:
parsed_url = urlparse(url)
clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
return clean_url
except Exception as e:
print(f"解析 URL 失败: {e}")
return url
# 写csv文件
def json_to_csv(data, output_file):
if not data:
return
headers = list(data[0].keys())
with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=headers)
writer.writeheader()
for row in data:
writer.writerow(row)