stock/scripts/iafd/merge/url_match.py


import json
import logging

import cloudscraper
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# When True, only a small batch of redirect links is processed (for testing)
test_flag = True
# Read stashdb.json
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON from {file_path}.")
        return []
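# Note: on a missing or malformed file, read_json returns [] rather than
# raising, so the downstream loop in process_urls simply has nothing to do.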
# Request a URL and return the final URL after following redirects
def fetch_real_url_2(url, scraper):
    try:
        response = scraper.get(url, allow_redirects=True)
        if response.status_code == 200:
            return response.url  # The final URL after all redirects
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
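# Note: fetch_real_url_2 follows the full redirect chain and downloads the
# final page body; fetch_real_url below reads only the Location header of
# the first 301/302 response, which is cheaper when only the target URL is
# needed. Only fetch_real_url is actually called from process_urls.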
# Request a URL without following redirects and return the Location header
def fetch_real_url(url, scraper):
    try:
        # Request the URL with automatic redirects disabled
        response = scraper.get(url, allow_redirects=False)
        # Check for a 301/302 response and extract the Location header
        if response.status_code in (301, 302):
            redirect_url = response.headers.get("Location")
            if redirect_url:
                logger.info(f"Redirected to: {redirect_url}")
                return redirect_url
            else:
                logger.warning(f"Redirect response received, but no Location header found for {url}")
                return None
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
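# Minimal usage sketch (the perfid value here is hypothetical; real IAFD
# redirect links share the same 'perfid=' shape):
#   scraper = cloudscraper.create_scraper()
#   target = fetch_real_url("https://www.iafd.com/person.rme/perfid=example", scraper)
#   if target:
#       print(target)  # the Location header of the 301/302 response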
# Process each URL
def process_urls(data, scraper):
    loop = 0
    global test_flag
    for entry in data:
        iafd_urls = entry.get('iafd_urls', [])
        real_urls = []
        for url in iafd_urls:
            if 'perfid=' in url:
                # Redirect-style link: request it and capture the redirect target
                real_url = fetch_real_url(url, scraper)
                if real_url:
                    real_urls.append(real_url)
                # In test mode, stop early after a small batch
                loop += 1
                if test_flag and loop > 10:
                    return data
            elif 'person.rme/id=' in url:
                # Already a direct link; add it as-is
                real_urls.append(url)
            else:
                # Unrecognized format; keep it, but log a warning
                real_urls.append(url)
                logger.warning(f"unknown url format: {url}")
        # Update the iafd_real_url field
        entry['iafd_real_url'] = real_urls
    return data
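# Sketch of the entry shape this function assumes (field names taken from the
# code above; the URL values are hypothetical):
#   {
#       "iafd_urls": ["https://www.iafd.com/person.rme/perfid=example"],
#       ...
#   }
# After processing, each entry additionally carries the resolved links:
#   "iafd_real_url": ["https://www.iafd.com/person.rme/id=example/..."]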
# Save the processed result to result.json
def save_to_json(data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logger.info(f"Data saved to {output_file}")
    except Exception as e:
        logger.error(f"Error saving to {output_file}: {e}")
# Main function
def main():
    # Input and output files
    input_file = 'stashdb.json'
    output_file = 'result.json'
    # Create a cloudscraper session (handles Cloudflare challenges)
    scraper = cloudscraper.create_scraper()
    # Read the data from stashdb.json
    data = read_json(input_file)
    # Process each URL and resolve redirects
    processed_data = process_urls(data, scraper)
    # Save the result to result.json
    save_to_json(processed_data, output_file)

if __name__ == "__main__":
    main()
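# Usage: place stashdb.json alongside this script and run it directly:
#   python url_match.py
# The resolved URLs are written to result.json. Set test_flag = False at the
# top of the file to process the full dataset instead of the first ~10
# redirect links.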