import re
import os
import json
import time
import csv
from datetime import datetime
from urllib.parse import urlparse
import logging

import config

# Root directory under which this module stores its output files.
update_dir = f'{config.global_host_data_dir}/javdb'

# Captures the movie id that follows "javdb.com/v/" up to the first "?" or "&".
# Case-insensitive so it agrees with the lower-cased checks done on hrefs.
_MOVIE_ID_RE = re.compile(r'javdb\.com/v/([^?&]+)', re.IGNORECASE)


def create_sub_directory(base_dir, str):
    """Create (if needed) and return the shard directory for a name.

    The shard is the lower-cased first character of ``str``; an empty
    ``str`` therefore maps to ``base_dir`` itself.

    Args:
        base_dir: Directory that holds the shard directories.
        str: Name used to pick the shard. The parameter name shadows the
            builtin and is kept only for backward compatibility.

    Returns:
        The path of the shard directory.

    Raises:
        OSError: If the directory cannot be created.
    """
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(full_path, exist_ok=True)
    return full_path


def extract_id_from_href(href):
    """Extract the lower-cased movie id from a ``javdb.com/v/<id>`` URL.

    Args:
        href: URL to inspect; may be empty or ``None``.

    Returns:
        The lower-cased id, or ``''`` when ``href`` is empty or is not a
        movie URL.
    """
    if not href:
        return ''
    match = _MOVIE_ID_RE.search(href)
    return match.group(1).lower() if match else ''


def _raw_html_path(href, *, create_dir):
    """Return the cache file path for a movie href, or ``None`` if not a movie URL."""
    movie_id = extract_id_from_href(href)
    if not movie_id:
        # Not a movie URL, or no id after "/v/": nothing sensible to name the file.
        return None
    base_dir = f"{update_dir}/raw_movies"
    if create_dir:
        file_dir = create_sub_directory(base_dir, movie_id)
    else:
        # Same layout as create_sub_directory, without touching the filesystem.
        file_dir = os.path.join(base_dir, movie_id[:1].lower())
    return os.path.join(file_dir, f"{movie_id}.html")


def write_raw_html(href, html_text):
    """Save fetched raw HTML to disk so it can be verified later.

    Only movie URLs (``javdb.com/v/<id>``) are stored; any other href is
    ignored. Failures while writing the file are logged, not raised.

    Args:
        href: URL the HTML was fetched from.
        html_text: HTML content to store.

    Raises:
        OSError: If the shard directory cannot be created.
    """
    full_path = _raw_html_path(href, create_dir=True)
    if full_path is None:
        return

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")


def read_raw_html(href, expire_date_str="2025-03-01"):
    """Load previously saved raw HTML for a movie URL, if it is fresh enough.

    Args:
        href: URL whose saved HTML should be loaded.
        expire_date_str: ``YYYY-MM-DD`` date; a saved file is used only if
            its modification time is later than this date.

    Returns:
        The file content, or ``None`` when ``href`` is not a movie URL, no
        file is saved, the file is expired, or reading fails (failures are
        logged, not raised).
    """
    full_path = _raw_html_path(href, create_dir=False)
    if full_path is None:
        return None

    try:
        if not os.path.exists(full_path):
            return None

        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date <= expire_date:
            logging.debug(f"expired file {last_modified_date} on href {href}")
            return None

        logging.debug(f"find local file on href {href}")
        with open(full_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None


def remove_url_query(url: str) -> str:
    """Strip params, query string and fragment from a URL.

    Example: ``https://www.javdb.com/makers/16w?f=download`` becomes
    ``https://www.javdb.com/makers/16w``.

    Args:
        url: URL to clean.

    Returns:
        The cleaned URL, or ``url`` unchanged if it cannot be parsed.
    """
    try:
        parsed_url = urlparse(url)
        # geturl() reassembles only the components that are present, so
        # scheme-less or relative URLs do not gain a stray "://" prefix.
        return parsed_url._replace(params='', query='', fragment='').geturl()
    except Exception as e:
        logging.warning(f"解析 URL 失败: {e}")
        return url


def json_to_csv(data, output_file):
    """Write a sequence of dict rows to a CSV file under ``update_dir``.

    Does nothing when ``data`` is empty.

    Args:
        data: Sequence of dicts, one per CSV row.
        output_file: File name relative to ``update_dir``.
    """
    if not data:
        return

    # Union of keys in first-seen order: a key that first appears in a later
    # row gets its own column instead of making DictWriter raise ValueError.
    headers = list(dict.fromkeys(key for row in data for key in row))
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)