# Helpers for caching raw javdb HTML pages locally and exporting scrape results to CSV.
import re
|
||
import os
|
||
import json
|
||
import time
|
||
import csv
|
||
from datetime import datetime
|
||
from urllib.parse import urlparse
|
||
import logging
|
||
import config
|
||
|
||
# Root directory for all javdb cache/output files, under the host data dir from config.
update_dir = f'{config.global_host_data_dir}/javdb'
|
||
|
||
# Create a one-letter bucket sub-directory used to spread cached files.
def create_sub_directory(base_dir, str):
    """Create (if needed) and return a one-letter sub-directory of *base_dir*.

    The bucket name is the lower-cased first character of *str*, spreading
    cached files across directories like ``a/``, ``b/``, ``0/`` ...

    NOTE: the second parameter shadows the ``str`` builtin; the name is kept
    for backward compatibility with any keyword callers.

    Args:
        base_dir: parent directory path.
        str: identifier whose first character selects the bucket.

    Returns:
        Full path of the (now existing) sub-directory.
    """
    bucket = str[:1].lower()
    full_path = os.path.join(base_dir, bucket)
    # exist_ok avoids the TOCTOU race of the former exists()/makedirs() pair.
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||
|
||
# Extract the movie id from a javdb movies URL; other URLs yield ''.
def extract_id_from_href(href):
    """Return the lower-cased movie id embedded in a javdb ``/v/`` URL.

    Only URLs containing the literal ``javdb.com/v/`` are considered; the id
    is the path segment after ``/v/`` up to the first ``?`` or ``&``.

    Args:
        href: candidate URL string.

    Returns:
        Lower-cased id string, or '' when *href* is not a movie URL.
    """
    if 'javdb.com/v/' not in href:
        return ''
    # Dots are escaped so an earlier look-alike such as 'javdbXcom/v/'
    # cannot be matched instead of the real host.
    match = re.search(r'javdb\.com/v/([^?&]+)', href)
    if match:
        return match.group(1).lower()
    # Literal present but no id characters follow (URL ends at '/v/').
    return ''
|
||
|
||
# Persist the raw HTML of a scraped page so it can be re-verified later.
def write_raw_html(href, html_text):
    """Cache *html_text* under ``<update_dir>/raw_movies/<bucket>/<id>.html``.

    URLs from which no movie id can be extracted are ignored, so we never
    write a bogus ``.html`` file with an empty name.

    Args:
        href: source URL of the scraped page.
        html_text: raw HTML body to store.
    """
    movie_id = extract_id_from_href(href)
    # Guard on the id itself rather than a lower-cased substring test:
    # extract_id_from_href is case-sensitive, so the old href.lower() gate
    # could pass while the id came back empty, producing a '.html' file.
    if not movie_id:
        return
    dir_prefix = 'raw_movies'

    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", movie_id)
    file_name = f"{movie_id}.html"
    full_path = os.path.join(file_dir, file_name)

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
|
||
|
||
|
||
# Read a previously cached raw HTML page, honouring an expiry cut-off date.
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for *href*, or None when absent, stale, or unreadable.

    A cache file is valid only when its mtime is strictly later than
    *expire_date_str* (``YYYY-MM-DD``); older files count as expired.

    Args:
        href: movie URL whose cached page is wanted.
        expire_date_str: cut-off date; files modified on/before it are stale.

    Returns:
        The cached HTML text, or None.
    """
    movie_id = extract_id_from_href(href)
    # Same guard as write_raw_html: bail out when no movie id is extractable,
    # instead of probing for a meaningless '.html' file.
    if not movie_id:
        return None
    dir_prefix = 'raw_movies'

    # NOTE(review): this creates the bucket directory even on a pure read,
    # which keeps the layout ready for the subsequent write_raw_html call.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", movie_id)
    file_name = f"{movie_id}.html"
    full_path = os.path.join(file_dir, file_name)

    try:
        if not os.path.exists(full_path):
            return None
        # Compare the file's last-modified time against the expiry date.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
|
||
|
||
|
||
|
||
# Strip query/fragment, e.g. https://www.javdb.com/makers/16w?f=download -> .../makers/16w
def remove_url_query(url: str) -> str:
    """Return *url* reduced to ``scheme://netloc/path``.

    Args:
        url: URL to clean.

    Returns:
        The cleaned URL, or the original *url* unchanged if parsing fails.
    """
    try:
        parsed_url = urlparse(url)
        return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    except Exception as e:
        # Use logging (not print) for consistency with the rest of this module.
        logging.warning(f"解析 URL 失败: {e}")
        return url
|
||
# Dump a list of flat dicts to a CSV file under update_dir.
def json_to_csv(data, output_file):
    """Write *data* (a list of dicts) as CSV to ``<update_dir>/<output_file>``.

    The header is the union of all row keys in first-seen order, so a row
    with keys absent from the first row no longer makes DictWriter raise
    ValueError; rows missing a header key get an empty cell.

    Args:
        data: list of flat dicts; each dict becomes one CSV row.
        output_file: file name relative to *update_dir*.
    """
    if not data:
        return
    # Union of keys preserving first-seen order (dicts are ordered in 3.7+).
    headers = list(dict.fromkeys(key for row in data for key in row))
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)
|