modify scripts
This commit is contained in:
@ -148,10 +148,15 @@ def fetch_movies_by_series():
|
|||||||
|
|
||||||
# 更新演员信息
|
# 更新演员信息
|
||||||
def fetch_performers_detail():
|
def fetch_performers_detail():
|
||||||
|
limit_count = 5 if debug else 100
|
||||||
perfomers_list = []
|
perfomers_list = []
|
||||||
|
last_perfomer_id = 0
|
||||||
while True:
|
while True:
|
||||||
# 每次从数据库中取一部分,避免一次全量获取
|
# 每次从数据库中取一部分,避免一次全量获取
|
||||||
perfomers_list = db_tools.query_actors(is_full_data=0, limit=100)
|
if force: # 从头逐个遍历
|
||||||
|
perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
|
||||||
|
else: # 只做更新
|
||||||
|
perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
|
||||||
if len(perfomers_list) < 1:
|
if len(perfomers_list) < 1:
|
||||||
logging.info(f'all performers fetched.')
|
logging.info(f'all performers fetched.')
|
||||||
break
|
break
|
||||||
@ -189,21 +194,27 @@ def fetch_performers_detail():
|
|||||||
})
|
})
|
||||||
if performer_id:
|
if performer_id:
|
||||||
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
|
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
|
||||||
|
last_perfomer_id = performer_id
|
||||||
else:
|
else:
|
||||||
logging.warning(f'insert person: ({person}) {url} failed.')
|
logging.warning(f'insert person: ({person}) {url} failed.')
|
||||||
|
time.sleep(0.5)
|
||||||
# 调试break
|
# 调试break
|
||||||
if debug:
|
if debug:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# 更新影片信息
|
# 更新影片信息
|
||||||
def fetch_movies_detail():
|
def fetch_movies_detail():
|
||||||
|
limit_count = 10 if debug else 100
|
||||||
movies_list = []
|
movies_list = []
|
||||||
|
last_movie_id = 0
|
||||||
while True:
|
while True:
|
||||||
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
|
if force: # 从头逐个遍历
|
||||||
|
movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
|
||||||
|
else: # 只做更新
|
||||||
|
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
|
||||||
if len(movies_list) < 1:
|
if len(movies_list) < 1:
|
||||||
logging.info(f'all movies fetched.')
|
logging.info(f'all movies fetched.')
|
||||||
break
|
break
|
||||||
last_movie_id = 0
|
|
||||||
succ_count = 0
|
succ_count = 0
|
||||||
for movie in movies_list:
|
for movie in movies_list:
|
||||||
url = movie['href']
|
url = movie['href']
|
||||||
@ -231,7 +242,7 @@ def fetch_movies_detail():
|
|||||||
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. url: {url}')
|
logging.warning(f'fetch_page error. url: {url}')
|
||||||
time.sleep(1)
|
time.sleep(0.5)
|
||||||
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
|
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
|
||||||
# 调试增加break
|
# 调试增加break
|
||||||
if debug:
|
if debug:
|
||||||
@ -253,6 +264,9 @@ function_map = {
|
|||||||
def main(cmd, args_debug, args_force):
|
def main(cmd, args_debug, args_force):
|
||||||
global debug
|
global debug
|
||||||
debug = args_debug
|
debug = args_debug
|
||||||
|
if debug:
|
||||||
|
logger = logging.getLogger()
|
||||||
|
#logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
global force
|
global force
|
||||||
force = args_force
|
force = args_force
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
|
|||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import config
|
import config
|
||||||
|
import utils
|
||||||
|
|
||||||
# 定义基础 URL 和可变参数
|
# 定义基础 URL 和可变参数
|
||||||
host_url = "https://www.javdb.com"
|
host_url = "https://www.javdb.com"
|
||||||
@ -24,8 +25,22 @@ headers = {
|
|||||||
}
|
}
|
||||||
scraper = cloudscraper.create_scraper()
|
scraper = cloudscraper.create_scraper()
|
||||||
|
|
||||||
|
# When true, fetch_page hands the fetched response text to utils.write_raw_html.
save_raw_html = True
|
||||||
|
# When true, fetch_page first tries utils.read_raw_html(url) and only goes to
# the network if no usable local copy passes the validator.
load_from_local = True
|
||||||
|
|
||||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||||
|
if load_from_local: # 从本地读取的逻辑
|
||||||
|
html = utils.read_raw_html(url)
|
||||||
|
if html:
|
||||||
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
html_text = preprocessor(html) if preprocessor else html
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_text, parser)
|
||||||
|
if validator(soup): # 进行自定义页面检查
|
||||||
|
logging.info(f"read from local. href: {url}")
|
||||||
|
return soup, 99 # 返回一个小于100的错误码,表明是从本地返回的
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
if 'javdb.com' not in url.lower():
|
if 'javdb.com' not in url.lower():
|
||||||
@ -50,6 +65,9 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
|||||||
logging.warning(f"Page redirected to login page on {url}.")
|
logging.warning(f"Page redirected to login page on {url}.")
|
||||||
return None, 401
|
return None, 401
|
||||||
|
|
||||||
|
if save_raw_html:
|
||||||
|
utils.write_raw_html(url, response.text)
|
||||||
|
|
||||||
# 预处理 HTML(如果提供了 preprocessor)
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||||
|
|
||||||
@ -223,7 +241,7 @@ def parse_actor_detail(soup, href):
|
|||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
# 解析 HTML 内容,提取需要的数据
|
||||||
def parse_movie_detail(soup, href, title):
|
def parse_movie_detail_old(soup, href, title):
|
||||||
div_video = soup.find("div", class_='video-meta-panel')
|
div_video = soup.find("div", class_='video-meta-panel')
|
||||||
if not div_video:
|
if not div_video:
|
||||||
logging.warning(f"Warning: No movies div found ")
|
logging.warning(f"Warning: No movies div found ")
|
||||||
@ -272,6 +290,74 @@ def parse_movie_detail(soup, href, title):
|
|||||||
'actors': actors
|
'actors': actors
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# 解析单个元素
|
||||||
|
def parse_movie_one(soup, keys):
    """Return the text of the value that follows the first matching label.

    Searches ``soup`` for the first ``<strong>`` whose string is one of
    ``keys`` and reads its next sibling ``<span class="value">``.

    Args:
        soup: Parsed document (or subtree) to search.
        keys: Collection of accepted label strings (e.g. the same label in
            several page languages).

    Returns:
        The stripped text of the value span, or None when the label or its
        value span cannot be found.
    """
    def _is_wanted_label(text):
        return text in keys

    label = soup.find('strong', string=_is_wanted_label)
    if not label:
        return None

    value = label.find_next_sibling('span', class_='value')
    if not value:
        return None

    return value.text.strip()
|
||||||
|
|
||||||
|
# 解析值和链接
|
||||||
|
def parse_movie_val_href(soup, keys):
    """Return the ``(text, absolute_url)`` pair for a labelled value.

    Searches ``soup`` for the first ``<strong>`` whose string is one of
    ``keys`` and reads its next sibling ``<span class="value">``.

    Args:
        soup: Parsed document (or subtree) to search.
        keys: Collection of accepted label strings.

    Returns:
        ``(text, url)`` where ``url`` is ``host_url`` + the anchor's href when
        the value span contains an ``<a>`` with a non-empty href;
        ``(text, None)`` when the value has no usable link;
        ``(None, None)`` when the label or its value span is missing.
    """
    label = soup.find('strong', string=lambda text: text in keys)
    value = label.find_next_sibling('span', class_='value') if label else None
    if not value:
        return None, None

    link = value.find('a')
    if not link:
        return value.text.strip(), None

    # An <a> without an href would make `host_url + None` raise TypeError;
    # keep the visible text and report "no link" instead.
    path = link.get('href')
    return link.text.strip(), (host_url + path if path else None)
|
||||||
|
|
||||||
|
# 解析多个值和链接
|
||||||
|
def parse_movie_arr(soup, keys):
    """Return every linked entry of a labelled multi-value field.

    Searches ``soup`` for the first ``<strong>`` whose string is one of
    ``keys``, reads its next sibling ``<span class="value">`` and collects the
    anchors inside it. Used for list-like fields such as tags and actors.

    Args:
        soup: Parsed document (or subtree) to search.
        keys: Collection of accepted label strings.

    Returns:
        A list of ``{'name': str, 'href': str}`` dicts in document order, with
        ``href`` made absolute by prefixing ``host_url``. Empty when the label
        or its value span is missing.
    """
    label = soup.find('strong', string=lambda text: text in keys)
    value = label.find_next_sibling('span', class_='value') if label else None
    if not value:
        return []

    # Anchors without an href are skipped: `host_url + None` would raise
    # TypeError, and an entry without a link is of no use as a lookup key.
    return [
        {'name': a_tag.text.strip(), 'href': host_url + a_tag.get('href')}
        for a_tag in value.find_all('a')
        if a_tag.get('href')
    ]
|
||||||
|
|
||||||
|
# 解析 HTML 内容,提取需要的数据
|
||||||
|
def parse_movie_detail(soup, href, title):
    """Extract the movie metadata from a parsed movie detail page.

    Args:
        soup: Parsed movie detail page.
        href: URL of the page; copied into the result unchanged.
        title: Movie title; copied into the result unchanged.

    Returns:
        A dict with the keys ``href``, ``title``, ``cover_url``,
        ``serial_number``, ``release_date``, ``duration``, ``maker_name``,
        ``maker_link``, ``series_name``, ``series_link``, ``tags`` and
        ``actors``. When the page has no ``div.video-meta-panel`` the tuple
        ``(None, None)`` is returned instead.
        NOTE(review): that tuple is truthy, so callers must not rely on a
        plain ``if result:`` check - confirm against the call sites.
    """
    # The meta panel is only used as a "is this a movie page" marker; every
    # field below is looked up on the whole document.
    if not soup.find("div", class_='video-meta-panel'):
        logging.warning("Warning: No movies div found ")
        return None, None

    # Cover image link
    cover_anchor = soup.select_one('.column-video-cover a')
    cover_url = cover_anchor['href'] if cover_anchor else None

    # Simple text fields; each label is listed in both page languages
    serial_number = parse_movie_one(soup, ['番號:', 'ID:'])
    release_date = parse_movie_one(soup, ['日期:', 'Released Date:'])
    duration = parse_movie_one(soup, ['時長:', 'Duration:'])

    # Maker and series: display name plus link
    maker_name, maker_link = parse_movie_val_href(soup, ['片商:', 'Maker:'])
    series_name, series_link = parse_movie_val_href(soup, ['系列:', 'Series:'])

    # Multi-valued fields: tags and actors
    tags = parse_movie_arr(soup, ['類別:', 'Tags:'])
    actors = parse_movie_arr(soup, ['演員:', 'Actor(s):'])

    return {
        'href': href,
        'title': title,
        'cover_url': cover_url,
        'serial_number': serial_number,
        'release_date': release_date,
        'duration': duration,
        'maker_name': maker_name,
        'maker_link': maker_link,
        'series_name': series_name,
        'series_link': series_link,
        'tags': tags,
        'actors': actors,
    }
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
# 解析 HTML 内容,提取需要的数据
|
||||||
def parse_series_uncensored(soup, href):
|
def parse_series_uncensored(soup, href):
|
||||||
div_series = soup.find("div", id='series')
|
div_series = soup.find("div", id='series')
|
||||||
|
|||||||
@ -49,7 +49,7 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
|
|||||||
|
|
||||||
performer_id = get_id_by_href('javdb_actors', href)
|
performer_id = get_id_by_href('javdb_actors', href)
|
||||||
if performer_id:
|
if performer_id:
|
||||||
logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
|
logging.debug(f"Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}")
|
||||||
|
|
||||||
return performer_id
|
return performer_id
|
||||||
|
|
||||||
@ -200,6 +200,33 @@ def query_actors(**filters):
|
|||||||
if "is_full_data" in filters:
|
if "is_full_data" in filters:
|
||||||
sql += " AND is_full_data = ?"
|
sql += " AND is_full_data = ?"
|
||||||
params.append(filters["is_full_data"])
|
params.append(filters["is_full_data"])
|
||||||
|
if "from_actor_list" in filters:
|
||||||
|
sql += " AND from_actor_list = ?"
|
||||||
|
params.append(filters["from_actor_list"])
|
||||||
|
if "is_full_data_in" in filters:
|
||||||
|
values = filters["is_full_data_in"]
|
||||||
|
if values:
|
||||||
|
placeholders = ", ".join(["?"] * len(values))
|
||||||
|
sql += f" AND is_full_data IN ({placeholders})"
|
||||||
|
params.extend(values)
|
||||||
|
if "is_full_data_not_in" in filters:
|
||||||
|
values = filters["is_full_data_not_in"]
|
||||||
|
if values:
|
||||||
|
placeholders = ", ".join(["?"] * len(values))
|
||||||
|
sql += f" AND is_full_data NOT IN ({placeholders})"
|
||||||
|
params.extend(values)
|
||||||
|
if "before_updated_at" in filters:
|
||||||
|
sql += " AND updated_at <= ?"
|
||||||
|
params.append(filters["before_updated_at"])
|
||||||
|
if "after_updated_at" in filters:
|
||||||
|
sql += " AND updated_at >= ?"
|
||||||
|
params.append(filters["after_updated_at"])
|
||||||
|
if "start_id" in filters:
|
||||||
|
sql += " AND id > ?"
|
||||||
|
params.append(filters["start_id"])
|
||||||
|
if "order_by" in filters:
|
||||||
|
sql += " order by ? asc"
|
||||||
|
params.append(filters["order_by"])
|
||||||
if 'limit' in filters:
|
if 'limit' in filters:
|
||||||
sql += " limit ?"
|
sql += " limit ?"
|
||||||
params.append(filters["limit"])
|
params.append(filters["limit"])
|
||||||
@ -372,13 +399,43 @@ def query_series_hrefs(**filters):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Insert or update a tag (category)
def insert_or_update_tags(name, href):
    """Upsert one row of ``javdb_tags`` keyed by ``href`` and return its id.

    Uses the module-level ``cursor`` / ``conn``. The upsert is committed
    before the id is read back.

    Args:
        name: Tag display name; overwrites the stored name on conflict.
        href: Tag URL; the conflict target of the upsert.

    Returns:
        The id of the tag row, or None when the row cannot be read back or a
        database error occurs (the transaction is rolled back and the error is
        logged in that case).
    """
    try:
        cursor.execute("""
            INSERT INTO javdb_tags (name, href, updated_at)
            VALUES (?, ? , datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                updated_at = datetime('now', 'localtime')
        """, (name, href))
        conn.commit()

        # Read back the tag id
        cursor.execute("SELECT id FROM javdb_tags WHERE href = ?", (href,))
        row = cursor.fetchone()
        # fetchone() returns None when nothing matches; indexing it directly
        # would raise TypeError, which the sqlite3.Error handler does not catch.
        if row is None or not row[0]:
            return None

        tag_id = row[0]
        logging.debug(f"insert/update tags succ. id: {tag_id}, name: {name}")
        return tag_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
|
||||||
|
|
||||||
# """插入或更新电影数据"""
|
# """插入或更新电影数据"""
|
||||||
def insert_or_update_movie(movie):
|
def insert_or_update_movie(movie):
|
||||||
try:
|
try:
|
||||||
# 获取相关 ID
|
# 获取相关 ID
|
||||||
makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
|
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
|
||||||
series_id = get_id_by_href('javdb_series', movie['series_link'])
|
series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
|
||||||
|
|
||||||
|
# 如果不存在,插入
|
||||||
|
if makers_id is None and movie['maker_link']:
|
||||||
|
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')})
|
||||||
|
if series_id is None and movie['series_link']:
|
||||||
|
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')})
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
|
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
|
||||||
@ -404,7 +461,7 @@ def insert_or_update_movie(movie):
|
|||||||
if movie_id is None:
|
if movie_id is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
logging.debug(f'insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}')
|
logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")
|
||||||
|
|
||||||
# 插入 performers_movies 关系表
|
# 插入 performers_movies 关系表
|
||||||
for performer in movie.get('actors', []):
|
for performer in movie.get('actors', []):
|
||||||
@ -412,14 +469,23 @@ def insert_or_update_movie(movie):
|
|||||||
# 如果演员不存在,先插入
|
# 如果演员不存在,先插入
|
||||||
if performer_id is None:
|
if performer_id is None:
|
||||||
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
|
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
|
||||||
|
logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}")
|
||||||
if performer_id:
|
if performer_id:
|
||||||
tmp_id = insert_actor_movie(performer_id, movie_id)
|
tmp_id = insert_actor_movie(performer_id, movie_id)
|
||||||
if tmp_id:
|
if tmp_id:
|
||||||
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||||
else:
|
else:
|
||||||
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}')
|
logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||||
else:
|
else:
|
||||||
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
|
logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}")
|
||||||
|
|
||||||
|
# 插入 tags 表
|
||||||
|
for tag in movie.get('tags', []):
|
||||||
|
tag_name = tag.get('name', '')
|
||||||
|
tag_href = tag.get('href', '')
|
||||||
|
tag_id = insert_or_update_tags(tag_name, tag_href)
|
||||||
|
if tag_id:
|
||||||
|
logging.debug(f"insert one tags. tag_id: {tag_id}, name:{tag_name}")
|
||||||
|
|
||||||
return movie_id
|
return movie_id
|
||||||
|
|
||||||
@ -516,6 +582,33 @@ def query_movie_hrefs(**filters):
|
|||||||
if "is_full_data" in filters:
|
if "is_full_data" in filters:
|
||||||
sql += " AND is_full_data = ?"
|
sql += " AND is_full_data = ?"
|
||||||
params.append(filters["is_full_data"])
|
params.append(filters["is_full_data"])
|
||||||
|
if "from_actor_list" in filters:
|
||||||
|
sql += " AND from_actor_list = ?"
|
||||||
|
params.append(filters["from_actor_list"])
|
||||||
|
if "is_full_data_in" in filters:
|
||||||
|
values = filters["is_full_data_in"]
|
||||||
|
if values:
|
||||||
|
placeholders = ", ".join(["?"] * len(values))
|
||||||
|
sql += f" AND is_full_data IN ({placeholders})"
|
||||||
|
params.extend(values)
|
||||||
|
if "is_full_data_not_in" in filters:
|
||||||
|
values = filters["is_full_data_not_in"]
|
||||||
|
if values:
|
||||||
|
placeholders = ", ".join(["?"] * len(values))
|
||||||
|
sql += f" AND is_full_data NOT IN ({placeholders})"
|
||||||
|
params.extend(values)
|
||||||
|
if "before_updated_at" in filters:
|
||||||
|
sql += " AND updated_at <= ?"
|
||||||
|
params.append(filters["before_updated_at"])
|
||||||
|
if "after_updated_at" in filters:
|
||||||
|
sql += " AND updated_at >= ?"
|
||||||
|
params.append(filters["after_updated_at"])
|
||||||
|
if "start_id" in filters:
|
||||||
|
sql += " AND id > ?"
|
||||||
|
params.append(filters["start_id"])
|
||||||
|
if "order_by" in filters:
|
||||||
|
sql += " order by ?"
|
||||||
|
params.append(filters["order_by"])
|
||||||
if 'limit' in filters:
|
if 'limit' in filters:
|
||||||
sql += " limit ?"
|
sql += " limit ?"
|
||||||
params.append(filters["limit"])
|
params.append(filters["limit"])
|
||||||
|
|||||||
@ -3,8 +3,98 @@ import os
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import csv
|
import csv
|
||||||
|
from datetime import datetime
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import logging
|
import logging
|
||||||
|
import config
|
||||||
|
|
||||||
|
# Base directory for javdb data, located under config.global_host_data_dir.
update_dir = f'{config.global_host_data_dir}/javdb'
|
||||||
|
|
||||||
|
# Create the shard sub-directory for a name
def create_sub_directory(base_dir, str):
    """Ensure the one-character shard directory for ``str`` exists.

    The shard is the first character of ``str``, lower-cased, so files are
    spread over ``base_dir/<char>/`` instead of one flat directory.

    Args:
        base_dir: Directory under which the shard directory is created
            (missing parents are created as well).
        str: Name to shard by. The parameter name shadows the builtin and is
            kept only for compatibility with existing callers. An empty
            string maps to ``base_dir`` itself.

    Returns:
        The path of the shard directory.
    """
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # Create first and tolerate "already exists" instead of checking
    # beforehand: a separate exists() check races with any other process
    # creating the same directory between the check and the makedirs call.
    try:
        os.makedirs(full_path)
    except FileExistsError:
        pass
    return full_path
|
||||||
|
|
||||||
|
# Extract the movie id from a movie URL only
def extract_id_from_href(href):
    """Return the lower-cased movie id of a ``javdb.com/v/<id>`` URL.

    The host is matched case-insensitively so the result agrees with callers
    that test ``href.lower()``. The id stops at the first ``/``, ``?``, ``&``
    or ``#`` so it never carries a path separator, query string or fragment;
    this matters because callers use it as a file name.

    Args:
        href: URL to inspect; may be empty or None.

    Returns:
        The id in lower case, or ``''`` when ``href`` is not a movie URL.
    """
    if not href:
        return ''
    match = re.search(r'javdb\.com/v/([^/?&#]+)', href, flags=re.IGNORECASE)
    return match.group(1).lower() if match else ''
|
||||||
|
|
||||||
|
# Save the fetched raw HTML so it can be checked later
def write_raw_html(href, html_text):
    """Store the raw HTML of a movie page under ``update_dir/raw_movies``.

    The file is written to ``<update_dir>/raw_movies/<shard>/<id>.html`` where
    ``id`` comes from ``extract_id_from_href`` and the shard directory from
    ``create_sub_directory``. Only movie URLs (``javdb.com/v/...``) are
    stored; anything else is ignored. Write failures are logged as warnings
    and never raised.

    Args:
        href: URL the HTML was fetched from.
        html_text: Page content to store (written as UTF-8).

    Returns:
        None.
    """
    movie_id = extract_id_from_href(href)
    # Without an id the file would be named ".html" and land directly in the
    # base directory, so treat a missing id like a non-movie URL.
    if 'javdb.com/v/' not in href.lower() or not movie_id:
        return

    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        # Saving the raw page is best-effort; never let it break the fetch.
        logging.warning(f"发生未知错误:{e}")
|
||||||
|
|
||||||
|
|
||||||
|
# Load previously saved raw HTML from the local copy
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return the locally stored HTML of a movie page if it is recent enough.

    Looks for ``<update_dir>/raw_movies/<shard>/<id>.html``, the location used
    by ``write_raw_html``. The file is used only when its modification time
    is later than ``expire_date_str``. Read failures are logged as warnings
    and never raised.

    Note that the shard directory is obtained through
    ``create_sub_directory``, which creates it when missing.

    Args:
        href: URL of the page; only movie URLs (``javdb.com/v/...``) are
            looked up.
        expire_date_str: ``YYYY-MM-DD`` date; files last modified at or before
            this date are treated as expired.

    Returns:
        The file content, or None when ``href`` is not a movie URL, no local
        file exists, the file is expired, or reading fails.
    """
    movie_id = extract_id_from_href(href)
    # Without an id the lookup would target a stray ".html" in the base
    # directory, so treat a missing id like a non-movie URL.
    if 'javdb.com/v/' not in href.lower() or not movie_id:
        return None

    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        if not os.path.exists(full_path):
            return None

        # Compare the file's last modification time with the expiry date
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if last_modified_date <= expire_date:
            logging.debug(f"expired file {last_modified_date} on href {href}")
            return None

        logging.debug(f"find local file on href {href}")
        with open(full_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        # The local copy is only a cache; fall back to "not available".
        logging.warning(f"发生未知错误:{e}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数
|
# 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数
|
||||||
|
|||||||
Reference in New Issue
Block a user