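modify scripts: treat HTTP 403 like 404 when fetching pages, pause briefly after a fetch error, gate the uncensored reset behind check_and_do, and expose it as the reset_un command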
This commit is contained in:
@ -2,11 +2,13 @@ import logging
|
|||||||
import sys
|
import sys
|
||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
import src.utils.utils as utils
|
import src.utils.utils as utils
|
||||||
|
|
||||||
http_code_404 = 404
|
http_code_404 = 404
|
||||||
|
http_code_403 = 403
|
||||||
http_code_redirect = 401
|
http_code_redirect = 401
|
||||||
http_code_url = 601
|
http_code_url = 601
|
||||||
http_code_local = 99
|
http_code_local = 99
|
||||||
@ -59,10 +61,10 @@ class GenericCrawler:
|
|||||||
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)
|
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)
|
||||||
|
|
||||||
# 处理 HTTP 状态码
|
# 处理 HTTP 状态码
|
||||||
if response.status_code == http_code_404:
|
if response.status_code in [http_code_404, http_code_403]:
|
||||||
logging.debug(f"Page not found (404): {url}")
|
logging.debug(f"get http code: {response.status_code}, url: {url}")
|
||||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
return None, response.status_code # 直接返回,调用方可以跳过
|
||||||
|
|
||||||
response.raise_for_status() # 处理 HTTP 错误
|
response.raise_for_status() # 处理 HTTP 错误
|
||||||
|
|
||||||
# 检查是否发生跳转,比如到登录页面
|
# 检查是否发生跳转,比如到登录页面
|
||||||
@ -86,6 +88,7 @@ class GenericCrawler:
|
|||||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
|
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
|
||||||
|
time.sleep(0.3)
|
||||||
|
|
||||||
logging.error(f'Fetching failed after max retries. {url}')
|
logging.error(f'Fetching failed after max retries. {url}')
|
||||||
return None, None # 达到最大重试次数仍然失败
|
return None, None # 达到最大重试次数仍然失败
|
||||||
|
|||||||
@ -634,7 +634,7 @@ class JavbusDBHandler(DatabaseHandler):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
# 处理影片的 无码 字段
|
# 处理影片的 无码 字段
|
||||||
def reset_movies_uncensored(self):
|
def reset_movies_uncensored(self, check_and_do = 0):
|
||||||
try:
|
try:
|
||||||
logging.info("创建临时表以便于保存待更新记录")
|
logging.info("创建临时表以便于保存待更新记录")
|
||||||
self.cursor.execute("""
|
self.cursor.execute("""
|
||||||
@ -669,24 +669,26 @@ class JavbusDBHandler(DatabaseHandler):
|
|||||||
total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0]
|
total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0]
|
||||||
logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")
|
logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")
|
||||||
|
|
||||||
# 1. 将所有记录的uncensored默认设为0
|
if check_and_do:
|
||||||
logging.info("开始将所有影片的uncensored设为默认值0...")
|
# 1. 将所有记录的uncensored默认设为0
|
||||||
self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
|
logging.info("开始将所有影片的uncensored设为默认值0...")
|
||||||
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")
|
self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
|
||||||
|
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")
|
||||||
|
|
||||||
# 2. 将临时表中匹配的记录设为1
|
# 2. 将临时表中匹配的记录设为1
|
||||||
logging.info("开始将匹配的影片的uncensored设为1...")
|
logging.info("开始将匹配的影片的uncensored设为1...")
|
||||||
self.cursor.execute("""
|
self.cursor.execute("""
|
||||||
UPDATE javbus_movies
|
UPDATE javbus_movies
|
||||||
SET uncensored = 1
|
SET uncensored = 1
|
||||||
WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
|
WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
|
||||||
""")
|
""")
|
||||||
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
|
logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
|
||||||
|
|
||||||
# 3. 清理临时表,也可以不清理,以便于抽检
|
self.conn.commit()
|
||||||
|
else:
|
||||||
|
logging.info("check完毕,本次忽略更新。。。")
|
||||||
|
|
||||||
self.conn.commit()
|
logging.info("任务执行完成!")
|
||||||
logging.info("所有更新已提交")
|
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
self.conn.rollback()
|
self.conn.rollback()
|
||||||
|
|||||||
@ -315,7 +315,8 @@ def fetch_performers_detail():
|
|||||||
uncensored = int(performer['uncensored'])
|
uncensored = int(performer['uncensored'])
|
||||||
avatar = None
|
avatar = None
|
||||||
if not utils.is_valid_url(url):
|
if not utils.is_valid_url(url):
|
||||||
logging.info(f'invalid url ({url}), name: {person}. skipping...')
|
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
|
||||||
|
logging.info(f'invalid url ({url}), id: {actor_id}, name: {person}. skipping...')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
next_url = url
|
next_url = url
|
||||||
@ -331,14 +332,9 @@ def fetch_performers_detail():
|
|||||||
avatar = data.get('avatar')
|
avatar = data.get('avatar')
|
||||||
all_movies.extend(data.get('movies', []))
|
all_movies.extend(data.get('movies', []))
|
||||||
|
|
||||||
elif status_code and status_code == craw.http_code_404:
|
elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
|
||||||
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
|
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': status_code})
|
||||||
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
|
logging.warning(f'get page http code {status_code}. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
|
||||||
need_insert = False
|
|
||||||
break
|
|
||||||
elif status_code and status_code == craw.http_code_redirect:
|
|
||||||
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect})
|
|
||||||
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
|
|
||||||
need_insert = False
|
need_insert = False
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -414,6 +410,7 @@ def fetch_movies_detail():
|
|||||||
curr_id = movie['id']
|
curr_id = movie['id']
|
||||||
uncensored = int(movie['uncensored'])
|
uncensored = int(movie['uncensored'])
|
||||||
if not utils.is_valid_url(url):
|
if not utils.is_valid_url(url):
|
||||||
|
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
|
||||||
logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...')
|
logging.info(f'invalid url ({url}), row id: {curr_id}. skipping...')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -442,12 +439,9 @@ def fetch_movies_detail():
|
|||||||
logging.warning(f'parse_page_movie error. url: {url}')
|
logging.warning(f'parse_page_movie error. url: {url}')
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
elif status_code and status_code == craw.http_code_404:
|
elif status_code and status_code in [craw.http_code_404, craw.http_code_403, craw.http_code_redirect]:
|
||||||
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
|
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': status_code})
|
||||||
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
logging.warning(f'get page http code {status_code}. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||||
elif status_code and status_code == craw.http_code_redirect:
|
|
||||||
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
|
|
||||||
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. url: {url}')
|
logging.warning(f'fetch_page error. url: {url}')
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
@ -456,7 +450,10 @@ def fetch_movies_detail():
|
|||||||
if debug:
|
if debug:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# 重置 movies 表的 uncensored 标志位
|
||||||
|
def reset_movies_uncensored():
|
||||||
|
db_tools.reset_movies_uncensored(check_and_do=0 if debug else 1)
|
||||||
|
|
||||||
# 建立缩写到函数的映射
|
# 建立缩写到函数的映射
|
||||||
function_map = {
|
function_map = {
|
||||||
"actor_list": fetch_actor_list,
|
"actor_list": fetch_actor_list,
|
||||||
@ -467,6 +464,7 @@ function_map = {
|
|||||||
"movies" : fetch_movies_detail,
|
"movies" : fetch_movies_detail,
|
||||||
"langs" : update_multi_langs,
|
"langs" : update_multi_langs,
|
||||||
"tags" : update_multilang_tags,
|
"tags" : update_multilang_tags,
|
||||||
|
"reset_un" : reset_movies_uncensored
|
||||||
}
|
}
|
||||||
|
|
||||||
# 主函数
|
# 主函数
|
||||||
|
|||||||
@ -2,9 +2,7 @@ import json
|
|||||||
import time
|
import time
|
||||||
import src.db_utils.sqlite_db as sqlite_db
|
import src.db_utils.sqlite_db as sqlite_db
|
||||||
import src.utils.utils as utils
|
import src.utils.utils as utils
|
||||||
import src.logger.logger as logger
|
|
||||||
|
|
||||||
logger.setup_logging()
|
|
||||||
db_tools = sqlite_db.JavbusDBHandler()
|
db_tools = sqlite_db.JavbusDBHandler()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -12,5 +10,4 @@ if __name__ == "__main__":
|
|||||||
result = db_tools.get_statics()
|
result = db_tools.get_statics()
|
||||||
utils.pretty_print_json(result)
|
utils.pretty_print_json(result)
|
||||||
|
|
||||||
db_tools.reset_movies_uncensored()
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user