modify scripts

This commit is contained in:
oscarz
2025-06-27 09:20:47 +08:00
parent d91ba1cd17
commit ac7cff9454
4 changed files with 39 additions and 39 deletions

View File

@ -2,11 +2,13 @@ import logging
import sys import sys
import requests import requests
import re import re
import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin
import src.utils.utils as utils import src.utils.utils as utils
http_code_404 = 404 http_code_404 = 404
http_code_403 = 403
http_code_redirect = 401 http_code_redirect = 401
http_code_url = 601 http_code_url = 601
http_code_local = 99 http_code_local = 99
@ -59,9 +61,9 @@ class GenericCrawler:
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies) response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)
# 处理 HTTP 状态码 # 处理 HTTP 状态码
if response.status_code == http_code_404: if response.status_code in [http_code_404, http_code_403]:
logging.debug(f"Page not found (404): {url}") logging.debug(f"get http code: {response.status_code}, url: {url}")
return None, http_code_404 # 直接返回 404,调用方可以跳过 return None, response.status_code # 直接返回,调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误 response.raise_for_status() # 处理 HTTP 错误
@ -86,6 +88,7 @@ class GenericCrawler:
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except Exception as e: except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...") logging.error(f"Unexpected error on {url}: {e}, Retrying...")
time.sleep(0.3)
logging.error(f'Fetching failed after max retries. {url}') logging.error(f'Fetching failed after max retries. {url}')
return None, None # 达到最大重试次数仍然失败 return None, None # 达到最大重试次数仍然失败

View File

@ -634,7 +634,7 @@ class JavbusDBHandler(DatabaseHandler):
return result return result
# 处理影片的 无码 字段 # 处理影片的 无码 字段
def reset_movies_uncensored(self): def reset_movies_uncensored(self, check_and_do = 0):
try: try:
logging.info("创建临时表以便于保存待更新记录") logging.info("创建临时表以便于保存待更新记录")
self.cursor.execute(""" self.cursor.execute("""
@ -669,24 +669,26 @@ class JavbusDBHandler(DatabaseHandler):
total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0] total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0]
logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片") logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")
# 1. 将所有记录的uncensored默认设为0 if check_and_do:
logging.info("开始将所有影片的uncensored设为默认值0...") # 1. 将所有记录的uncensored默认设为0
self.cursor.execute("UPDATE javbus_movies SET uncensored = 0") logging.info("开始将所有影片的uncensored设为默认值0...")
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0") self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")
# 2. 将临时表中匹配的记录设为1 # 2. 将临时表中匹配的记录设为1
logging.info("开始将匹配的影片的uncensored设为1...") logging.info("开始将匹配的影片的uncensored设为1...")
self.cursor.execute(""" self.cursor.execute("""
UPDATE javbus_movies UPDATE javbus_movies
SET uncensored = 1 SET uncensored = 1
WHERE id IN (SELECT movie_id FROM temp_movies_to_update) WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
""") """)
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1") logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
# 3. 清理临时表,也可以不清理,以便于抽检 self.conn.commit()
else:
logging.info("check完毕本次忽略更新。。。")
self.conn.commit() logging.info("任务执行完成!")
logging.info("所有更新已提交")
except sqlite3.Error as e: except sqlite3.Error as e:
self.conn.rollback() self.conn.rollback()

View File

@ -315,7 +315,8 @@ def fetch_performers_detail():
uncensored = int(performer['uncensored']) uncensored = int(performer['uncensored'])
avatar = None avatar = None
if not utils.is_valid_url(url): if not utils.is_valid_url(url):
logging.info(f'invalid url ({url}), name: {person}. skipping...') actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
logging.info(f'invalid url ({url}), id: {actor_id}, name: {person}. skipping...')
continue continue
next_url = url next_url = url
@ -331,14 +332,9 @@ def fetch_performers_detail():
avatar = data.get('avatar') avatar = data.get('avatar')
all_movies.extend(data.get('movies', [])) all_movies.extend(data.get('movies', []))
elif status_code and status_code == craw.http_code_404: elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404}) actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': status_code})
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...') logging.warning(f'get page http code {status_code}. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
elif status_code and status_code == craw.http_code_redirect:
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False need_insert = False
break break
else: else:
@ -414,6 +410,7 @@ def fetch_movies_detail():
curr_id = movie['id'] curr_id = movie['id']
uncensored = int(movie['uncensored']) uncensored = int(movie['uncensored'])
if not utils.is_valid_url(url): if not utils.is_valid_url(url):
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...') logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...')
continue continue
@ -442,12 +439,9 @@ def fetch_movies_detail():
logging.warning(f'parse_page_movie error. url: {url}') logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(2) time.sleep(2)
elif status_code and status_code == craw.http_code_404: elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404}) movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': status_code})
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...') logging.warning(f'get page http code {status_code}. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
elif status_code and status_code == craw.http_code_redirect:
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
else: else:
logging.warning(f'fetch_page error. url: {url}') logging.warning(f'fetch_page error. url: {url}')
time.sleep(0.5) time.sleep(0.5)
@ -456,6 +450,9 @@ def fetch_movies_detail():
if debug: if debug:
return True return True
# Reset the uncensored flag on the movies table.
def reset_movies_uncensored():
    """Run the database handler's uncensored-flag reset for the movies table.

    The module-level ``debug`` switch selects the mode passed down as
    ``check_and_do``: 0 when ``debug`` is truthy, 1 otherwise.
    """
    if debug:
        apply_flag = 0
    else:
        apply_flag = 1
    db_tools.reset_movies_uncensored(check_and_do=apply_flag)
# 建立缩写到函数的映射 # 建立缩写到函数的映射
function_map = { function_map = {
@ -467,6 +464,7 @@ function_map = {
"movies" : fetch_movies_detail, "movies" : fetch_movies_detail,
"langs" : update_multi_langs, "langs" : update_multi_langs,
"tags" : update_multilang_tags, "tags" : update_multilang_tags,
"reset_un" : reset_movies_uncensored
} }
# 主函数 # 主函数

View File

@ -2,9 +2,7 @@ import json
import time import time
import src.db_utils.sqlite_db as sqlite_db import src.db_utils.sqlite_db as sqlite_db
import src.utils.utils as utils import src.utils.utils as utils
import src.logger.logger as logger
logger.setup_logging()
db_tools = sqlite_db.JavbusDBHandler() db_tools = sqlite_db.JavbusDBHandler()
if __name__ == "__main__": if __name__ == "__main__":
@ -12,5 +10,4 @@ if __name__ == "__main__":
result = db_tools.get_statics() result = db_tools.get_statics()
utils.pretty_print_json(result) utils.pretty_print_json(result)
db_tools.reset_movies_uncensored()