modify scripts

oscarz
2025-06-24 11:39:29 +08:00
parent 12c53b043d
commit c5feab2c22
7 changed files with 808 additions and 1773 deletions


@@ -9,12 +9,13 @@ from functools import partial
 from urllib.parse import urljoin, urlparse
 import src.config.config as config
 import src.logger.logger as logger
-import src.db_utils.db_javbus as db_tools
-import src.crawling.craw_common as scraper_base
-import src.crawling.craw_javbus as scraper
+import src.db_utils.sqlite_db as sqlite_db
+import src.crawling.craw as craw
 import src.utils.utils as utils
 logger.setup_logging()
+db_tools = sqlite_db.JavbusDBHandler()
+scraper = craw.JavbusCrawler()
 debug = False
 skip_local = False
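
Note: this hunk swaps the module-level helpers (craw_common / craw_javbus, db_javbus) for class instances, so request headers and cookies move off the call sites and onto the crawler object. A minimal sketch of what the new craw.JavbusCrawler interface could look like, inferred only from the call sites in this diff (the body below is an assumption, not code from this commit):

import requests
from bs4 import BeautifulSoup

class JavbusCrawler:
    def __init__(self):
        # headers/cookies now live on the instance, so callers of
        # fetch_page no longer pass them explicitly
        self.headers = {"User-Agent": "Mozilla/5.0"}  # placeholder value
        self.cookies = {}

    def generic_validator(self, soup, tag, identifier, attr_type):
        # True when the page contains its identifying element,
        # e.g. <div id="waterfall"> on the actor list pages
        return soup.find(tag, {attr_type: identifier}) is not None

    def fetch_page(self, url, validator):
        # Fetch url, parse it, and return (soup, status); soup is None
        # when the request failed or the validator rejected the page.
        resp = requests.get(url, headers=self.headers, cookies=self.cookies)
        if resp.status_code != 200:
            return None, resp.status_code
        soup = BeautifulSoup(resp.text, "html.parser")
        return (soup if validator(soup) else None), resp.status_code

Holding headers and cookies on the instance is what lets the fetch_page call in the next hunk drop its extra keyword arguments.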
@@ -34,7 +35,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
     num = 1
     while current_url:
         logging.info(f"fetching url {current_url}")
-        soup, status_code = scraper_base.fetch_page(current_url, partial(scraper_base.generic_validator, tag="div", identifier="waterfall", attr_type="id"), headers=scraper.headers, cookies=scraper.cookies)
+        soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
         if soup:
             list_data, current_url = scraper.parse_actors_list(soup, current_url)
             if list_data:
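
The second argument to fetch_page is a validator bound with functools.partial, so the crawler can call it with the parsed soup alone. A usage sketch under the same assumptions as above:

from functools import partial

# Bind "the page must contain <div id='waterfall'>" into a one-argument
# callable; fetch_page then only has to call validator(soup).
validator = partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id")
soup, status_code = scraper.fetch_page(current_url, validator)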
@@ -50,9 +51,12 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
             else:
                 logging.warning(f'fetch actor error. {current_url} ...')
-        elif status_code and status_code == 404:
+        elif status_code:
             logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
             break
+        else:  # hit the failure limit; sleep and keep retrying
+            time.sleep(5)
+        time.sleep(0.3)
         # debug break
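
The new else branch treats "no soup and no status code" as a transient failure: back off 5s and retry the same URL, with a 0.3s politeness delay pacing every iteration. Any HTTP error code now breaks the loop, where previously only 404 did. Condensed, the loop's control flow reads roughly as:

while current_url:
    soup, status_code = scraper.fetch_page(current_url, validator)
    if soup:
        ...              # parse rows, store them, advance current_url
    elif status_code:
        break            # any HTTP error code: log and give up
    else:
        time.sleep(5)    # failure limit hit: back off, retry the same URL
    time.sleep(0.3)      # politeness delay between requests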
@@ -62,36 +66,13 @@ def fetch_actor_list_lang(lang="en", uncensored=None):

 # fetch the actor list
 def fetch_actor_list():
     #for lang in ["en", "ja", "zh"]:
-    for lang in ['ja']:
+    for lang in ['en']:
         fetch_actor_list_lang(lang=lang, uncensored=1)
     #for lang in ["en", "ja", "zh"]:
-    for lang in ['ja']:
+    for lang in ['en']:
         fetch_actor_list_lang(lang=lang)

-# fetch the actor list
-def fetch_actor_list2():
-    next_url = scraper.actors_uncensored_base_url
-    while next_url:
-        logging.info(f'fetching page {next_url}')
-        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
-            if list_data:
-                # write to the database
-                for row in list_data:
-                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
-                    if actor_id:
-                        logging.debug(f"insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['name']}, href:{row['href']}")
-            else:
-                logging.warning(f'fetch actor error. {next_url} ...')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-            break

 # fetch the makers list
 def fetch_makers_list():
     next_url = scraper.makers_uncensored_base_url
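
fetch_makers_list (cut off here) opens with the same fetch → parse → insert → advance loop used by the actor fetchers. A generic sketch of that shared pattern (crawl_paginated and its parameters are illustrative names, not from this repo):

import logging
import time

def crawl_paginated(start_url, validator, parse_fn, insert_row):
    # Generic fetch -> parse -> store loop shared by the actor and maker
    # fetchers; parse_fn returns (rows, next_url), None ends pagination.
    # Assumes the module-level `scraper` instance from the diff above.
    next_url = start_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, validator)
        if not soup:
            if status_code:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
            time.sleep(5)  # transient failure: back off and retry
            continue
        rows, next_url = parse_fn(soup, next_url)
        for row in rows or []:
            insert_row(row)
        time.sleep(0.3)  # politeness delay between pages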