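modify scripts: handle HTTP 403 like 404 in the crawler, record invalid URLs as 404, and gate the uncensored reset behind a check_and_do flag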

This commit is contained in:
oscarz
2025-06-27 09:20:47 +08:00
parent d91ba1cd17
commit ac7cff9454
4 changed files with 39 additions and 39 deletions

View File

@ -2,11 +2,13 @@ import logging
import sys import sys
import requests import requests
import re import re
import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin
import src.utils.utils as utils import src.utils.utils as utils
http_code_404 = 404 http_code_404 = 404
http_code_403 = 403
http_code_redirect = 401 http_code_redirect = 401
http_code_url = 601 http_code_url = 601
http_code_local = 99 http_code_local = 99
@ -59,9 +61,9 @@ class GenericCrawler:
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies) response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)
# 处理 HTTP 状态码 # 处理 HTTP 状态码
if response.status_code == http_code_404: if response.status_code in [http_code_404, http_code_403]:
logging.debug(f"Page not found (404): {url}") logging.debug(f"get http code: {response.status_code}, url: {url}")
return None, http_code_404 # 直接返回 404,调用方可以跳过 return None, response.status_code # 直接返回,调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误 response.raise_for_status() # 处理 HTTP 错误
@ -86,6 +88,7 @@ class GenericCrawler:
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except Exception as e: except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...") logging.error(f"Unexpected error on {url}: {e}, Retrying...")
time.sleep(0.3)
logging.error(f'Fetching failed after max retries. {url}') logging.error(f'Fetching failed after max retries. {url}')
return None, None # 达到最大重试次数仍然失败 return None, None # 达到最大重试次数仍然失败

View File

@ -634,7 +634,7 @@ class JavbusDBHandler(DatabaseHandler):
return result return result
# 处理影片的 无码 字段 # 处理影片的 无码 字段
def reset_movies_uncensored(self): def reset_movies_uncensored(self, check_and_do = 0):
try: try:
logging.info("创建临时表以便于保存待更新记录") logging.info("创建临时表以便于保存待更新记录")
self.cursor.execute(""" self.cursor.execute("""
@ -669,6 +669,7 @@ class JavbusDBHandler(DatabaseHandler):
total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0] total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0]
logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片") logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")
if check_and_do:
# 1. 将所有记录的uncensored默认设为0 # 1. 将所有记录的uncensored默认设为0
logging.info("开始将所有影片的uncensored设为默认值0...") logging.info("开始将所有影片的uncensored设为默认值0...")
self.cursor.execute("UPDATE javbus_movies SET uncensored = 0") self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
@ -683,10 +684,11 @@ class JavbusDBHandler(DatabaseHandler):
""") """)
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1") logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
# 3. 清理临时表,也可以不清理,以便于抽检
self.conn.commit() self.conn.commit()
logging.info("所有更新已提交") else:
logging.info("check完毕本次忽略更新。。。")
logging.info("任务执行完成!")
except sqlite3.Error as e: except sqlite3.Error as e:
self.conn.rollback() self.conn.rollback()

View File

@ -315,7 +315,8 @@ def fetch_performers_detail():
uncensored = int(performer['uncensored']) uncensored = int(performer['uncensored'])
avatar = None avatar = None
if not utils.is_valid_url(url): if not utils.is_valid_url(url):
logging.info(f'invalid url ({url}), name: {person}. skipping...') actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
logging.info(f'invalid url ({url}), id: {actor_id}, name: {person}. skipping...')
continue continue
next_url = url next_url = url
@ -331,14 +332,9 @@ def fetch_performers_detail():
avatar = data.get('avatar') avatar = data.get('avatar')
all_movies.extend(data.get('movies', [])) all_movies.extend(data.get('movies', []))
elif status_code and status_code == craw.http_code_404: elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404}) actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': status_code})
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...') logging.warning(f'get page http code {status_code}. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
elif status_code and status_code == craw.http_code_redirect:
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False need_insert = False
break break
else: else:
@ -414,6 +410,7 @@ def fetch_movies_detail():
curr_id = movie['id'] curr_id = movie['id']
uncensored = int(movie['uncensored']) uncensored = int(movie['uncensored'])
if not utils.is_valid_url(url): if not utils.is_valid_url(url):
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...') logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...')
continue continue
@ -442,12 +439,9 @@ def fetch_movies_detail():
logging.warning(f'parse_page_movie error. url: {url}') logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(2) time.sleep(2)
elif status_code and status_code == craw.http_code_404: elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404}) movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': status_code})
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...') logging.warning(f'get page http code {status_code}. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
elif status_code and status_code == craw.http_code_redirect:
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
else: else:
logging.warning(f'fetch_page error. url: {url}') logging.warning(f'fetch_page error. url: {url}')
time.sleep(0.5) time.sleep(0.5)
@ -456,6 +450,9 @@ def fetch_movies_detail():
if debug: if debug:
return True return True
# 重置 movies 表的 uncensored 标志位
def reset_movies_uncensored():
db_tools.reset_movies_uncensored(check_and_do=0 if debug else 1)
# 建立缩写到函数的映射 # 建立缩写到函数的映射
function_map = { function_map = {
@ -467,6 +464,7 @@ function_map = {
"movies" : fetch_movies_detail, "movies" : fetch_movies_detail,
"langs" : update_multi_langs, "langs" : update_multi_langs,
"tags" : update_multilang_tags, "tags" : update_multilang_tags,
"reset_un" : reset_movies_uncensored
} }
# 主函数 # 主函数

View File

@ -2,9 +2,7 @@ import json
import time import time
import src.db_utils.sqlite_db as sqlite_db import src.db_utils.sqlite_db as sqlite_db
import src.utils.utils as utils import src.utils.utils as utils
import src.logger.logger as logger
logger.setup_logging()
db_tools = sqlite_db.JavbusDBHandler() db_tools = sqlite_db.JavbusDBHandler()
if __name__ == "__main__": if __name__ == "__main__":
@ -12,5 +10,4 @@ if __name__ == "__main__":
result = db_tools.get_statics() result = db_tools.get_statics()
utils.pretty_print_json(result) utils.pretty_print_json(result)
db_tools.reset_movies_uncensored()