# Helpers for caching raw javdb HTML pages locally and exporting scrape results to CSV.
import re
|
||
import os
|
||
import json
|
||
import time
|
||
import csv
|
||
from datetime import datetime
|
||
from urllib.parse import urlparse
|
||
import logging
|
||
import config
|
||
|
||
# Root directory for all javdb cache/output files, under the host data dir from config.
update_dir = f'{config.global_host_data_dir}/javdb'
|
||
|
||
# Create a one-letter bucket sub-directory used to spread cached files.
def create_sub_directory(base_dir, str):
    """Create (if needed) and return a one-letter sub-directory of *base_dir*.

    The bucket name is the lower-cased first character of *str*, spreading
    cached files across directories like ``a/``, ``b/``, ``0/`` ...

    NOTE: the second parameter shadows the ``str`` builtin; the name is kept
    for backward compatibility with any keyword callers.

    Args:
        base_dir: parent directory path.
        str: identifier whose first character selects the bucket.

    Returns:
        Full path of the (now existing) sub-directory.
    """
    bucket = str[:1].lower()
    full_path = os.path.join(base_dir, bucket)
    # exist_ok avoids the TOCTOU race of the former exists()/makedirs() pair.
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||
|
||
# Extract the movie id from a javdb movies URL; other URLs yield ''.
def extract_id_from_href(href):
    """Return the lower-cased movie id embedded in a javdb ``/v/`` URL.

    Only URLs containing the literal ``javdb.com/v/`` are considered; the id
    is the path segment after ``/v/`` up to the first ``?`` or ``&``.

    Args:
        href: candidate URL string.

    Returns:
        Lower-cased id string, or '' when *href* is not a movie URL.
    """
    if 'javdb.com/v/' not in href:
        return ''
    # Dots are escaped so an earlier look-alike such as 'javdbXcom/v/'
    # cannot be matched instead of the real host.
    match = re.search(r'javdb\.com/v/([^?&]+)', href)
    if match:
        return match.group(1).lower()
    # Literal present but no id characters follow (URL ends at '/v/').
    return ''
|
||
|
||
# Persist the raw HTML of a scraped page so it can be re-verified later.
def write_raw_html(href, html_text):
    """Cache *html_text* under ``<update_dir>/raw_movies/<bucket>/<id>.html``.

    URLs from which no movie id can be extracted are ignored, so we never
    write a bogus ``.html`` file with an empty name.

    Args:
        href: source URL of the scraped page.
        html_text: raw HTML body to store.
    """
    movie_id = extract_id_from_href(href)
    # Guard on the id itself rather than a lower-cased substring test:
    # extract_id_from_href is case-sensitive, so the old href.lower() gate
    # could pass while the id came back empty, producing a '.html' file.
    if not movie_id:
        return
    dir_prefix = 'raw_movies'

    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", movie_id)
    file_name = f"{movie_id}.html"
    full_path = os.path.join(file_dir, file_name)

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
|
||
|
||
|
||
# Read a previously cached raw HTML page, honouring an expiry cut-off date.
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for *href*, or None when absent, stale, or unreadable.

    A cache file is valid only when its mtime is strictly later than
    *expire_date_str* (``YYYY-MM-DD``); older files count as expired.

    Args:
        href: movie URL whose cached page is wanted.
        expire_date_str: cut-off date; files modified on/before it are stale.

    Returns:
        The cached HTML text, or None.
    """
    movie_id = extract_id_from_href(href)
    # Same guard as write_raw_html: bail out when no movie id is extractable,
    # instead of probing for a meaningless '.html' file.
    if not movie_id:
        return None
    dir_prefix = 'raw_movies'

    # NOTE(review): this creates the bucket directory even on a pure read,
    # which keeps the layout ready for the subsequent write_raw_html call.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", movie_id)
    file_name = f"{movie_id}.html"
    full_path = os.path.join(file_dir, file_name)

    try:
        if not os.path.exists(full_path):
            return None
        # Compare the file's last-modified time against the expiry date.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
|
||
|
||
|
||
|
||
# Strip query/fragment, e.g. https://www.javdb.com/makers/16w?f=download -> .../makers/16w
def remove_url_query(url: str) -> str:
    """Return *url* reduced to ``scheme://netloc/path``.

    Args:
        url: URL to clean.

    Returns:
        The cleaned URL, or the original *url* unchanged if parsing fails.
    """
    try:
        parsed_url = urlparse(url)
        return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    except Exception as e:
        # Use logging (not print) for consistency with the rest of this module.
        logging.warning(f"解析 URL 失败: {e}")
        return url
|
||
# Dump a list of flat dicts to a CSV file under update_dir.
def json_to_csv(data, output_file):
    """Write *data* (a list of dicts) as CSV to ``<update_dir>/<output_file>``.

    The header is the union of all row keys in first-seen order, so a row
    with keys absent from the first row no longer makes DictWriter raise
    ValueError; rows missing a header key get an empty cell.

    Args:
        data: list of flat dicts; each dict becomes one CSV row.
        output_file: file name relative to *update_dir*.
    """
    if not data:
        return
    # Union of keys preserving first-seen order (dicts are ordered in 3.7+).
    headers = list(dict.fromkeys(key for row in data for key in row))
    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)
|