modify scripts

This commit is contained in:
oscarz
2025-06-24 10:02:28 +08:00
parent 882ee5047a
commit 12c53b043d
8 changed files with 2569 additions and 0 deletions

167
src/utils/utils.py Normal file
View File

@@ -0,0 +1,167 @@
import re
import os
import json
import time
import csv
from datetime import datetime
from urllib.parse import urlparse
import logging
import src.config.config as config
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
update_dir = f'{config.global_host_data_dir}/javdb'
def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme (http/https) and a netloc (example.com)."""
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    # Both pieces must be non-empty for the URL to be usable.
    return bool(parts.scheme) and bool(parts.netloc)
# Create (if needed) and return a one-character bucket directory.
def create_sub_directory(base_dir, name):
    """Return base_dir/<first character of *name*, lowercased>, creating it if missing.

    Bucketing by the first character keeps individual directories small when
    many files are stored.

    Note: the original parameter was named ``str`` (shadowing the builtin) and
    its comment claimed "first two letters" while the code takes only one
    character; both are corrected here without changing behavior.
    """
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the TOCTOU race between an exists() check and makedirs().
    os.makedirs(full_path, exist_ok=True)
    return full_path
# Extract the id only from movie URLs.
def extract_id_from_href(href):
    """Return the lowercased video id from a javdb movie URL (``.../v/<id>``).

    Returns '' when *href* is not a movie-page URL or no id can be extracted.
    """
    if 'javdb.com/v/' in href:
        # Dots are escaped: the original pattern r'javdb.com/v/...' let '.'
        # match any character, making the regex more permissive than intended.
        match = re.search(r'javdb\.com/v/([^?&]+)', href)
        if match:
            # Ids are compared case-insensitively elsewhere, so normalize here.
            return match.group(1).lower()
    return ''
# Save the fetched raw HTML so it can be re-checked later.
def write_raw_html(href, html_text):
    """Persist raw HTML of a movie page to <update_dir>/raw_movies/<bucket>/<id>.html.

    Non-movie URLs are silently ignored. Write failures are logged rather than
    raised so one bad file cannot abort a crawl run.
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    movie_id = extract_id_from_href(href)
    if 'javdb.com/v/' not in href.lower():
        # Only movie detail pages are cached.
        return
    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    # (The old "replace spaces with -" comment here was stale; nothing is replaced.)
    full_path = os.path.join(file_dir, f"{movie_id}.html")
    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
# Read back previously saved raw HTML (the old header comment was a copy-paste
# of the writer's; this function reads, not saves).
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for a movie URL, or None.

    None is returned when *href* is not a movie page, no cached file exists,
    the file was last modified on or before *expire_date_str* (``YYYY-MM-DD``),
    or a read error occurs (errors are logged, never raised).
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    movie_id = extract_id_from_href(href)
    if 'javdb.com/v/' not in href.lower():
        return None
    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")
    try:
        if not os.path.exists(full_path):
            return None
        # Treat the cache entry as fresh only if it was written after the
        # expiry cutoff date.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
# Drop everything after the path, e.g.
# https://www.javdb.com/makers/16w?f=download -> https://www.javdb.com/makers/16w
def remove_url_query(url: str) -> str:
    """Return *url* reduced to scheme://netloc/path (params, query and fragment dropped)."""
    try:
        parts = urlparse(url)
        return ''.join((parts.scheme, '://', parts.netloc, parts.path))
    except Exception as e:
        # Fall back to the original string on any parsing failure.
        print(f"解析 URL 失败: {e}")
        return url
# Export a list of JSON rows to a CSV file.
def json_to_csv(data, output_file):
    """Write a list of dicts to ``<update_dir>/<output_file>`` as CSV.

    Column order follows the keys of the first row. Rows containing keys not
    present in the header are tolerated (extras dropped) and missing keys are
    written as '' — the original raised ``ValueError`` on the first mismatched
    row, losing the whole export. No-op for empty input.
    """
    if not data:
        return
    headers = list(data[0].keys())
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers,
                                restval='', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(data)
def normalize_url(url: str) -> str:
    """Normalize a URL by removing a leading language-code path segment, so that
    different language versions of the same page compare equal.

    Examples:
        https://www.javbus.com/ja/star/p8y -> https://www.javbus.com/star/p8y
        https://www.javbus.com/en/star/p8y -> https://www.javbus.com/star/p8y
    """
    # Language prefixes that may appear as the first path segment.
    LANGUAGES = {'ja', 'en', 'ko', 'zh', 'fr', 'de', 'es', 'ru'}
    try:
        parsed = urlparse(url)
        segments = parsed.path.strip('/').split('/')
        if segments and segments[0] in LANGUAGES:
            del segments[0]
        # Rebuild the URL with the cleaned path; scheme, host and query survive.
        return parsed._replace(path='/' + '/'.join(segments)).geturl()
    except Exception as e:
        # On any failure, hand back the input unchanged.
        print(f"URL标准化失败: {url}, 错误: {e}")
        return url