modify scripts
This commit is contained in:
@ -9,12 +9,13 @@ from functools import partial
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import src.config.config as config
|
||||
import src.logger.logger as logger
|
||||
import src.db_utils.db_javbus as db_tools
|
||||
import src.crawling.craw_common as scraper_base
|
||||
import src.crawling.craw_javbus as scraper
|
||||
import src.db_utils.sqlite_db as sqlite_db
|
||||
import src.crawling.craw as craw
|
||||
import src.utils.utils as utils
|
||||
|
||||
logger.setup_logging()
|
||||
db_tools = sqlite_db.JavbusDBHandler()
|
||||
scraper = craw.JavbusCrawler()
|
||||
|
||||
debug = False
|
||||
skip_local = False
|
||||
@ -34,7 +35,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
||||
num = 1
|
||||
while current_url:
|
||||
logging.info(f"fetching url {current_url}")
|
||||
soup, status_code = scraper_base.fetch_page(current_url, partial(scraper_base.generic_validator, tag="div", identifier="waterfall", attr_type="id"), headers=scraper.headers, cookies=scraper.cookies)
|
||||
soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||
if soup:
|
||||
list_data, current_url = scraper.parse_actors_list(soup, current_url)
|
||||
if list_data :
|
||||
@ -50,9 +51,12 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
||||
else:
|
||||
logging.warning(f'fetch actor error. {current_url} ...')
|
||||
|
||||
elif status_code and status_code == 404:
|
||||
elif status_code :
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
|
||||
break
|
||||
else: # 达到失败上限,加上休眠继续重试
|
||||
time.sleep(5)
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
# 调试break
|
||||
@ -62,36 +66,13 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
||||
# 获取演员列表
|
||||
def fetch_actor_list():
|
||||
#for lang in ["en", "ja", "zh"]:
|
||||
for lang in ['ja']:
|
||||
for lang in ['en']:
|
||||
fetch_actor_list_lang(lang=lang, uncensored=1)
|
||||
|
||||
#for lang in ["en", "ja", "zh"]:
|
||||
for lang in ['ja']:
|
||||
for lang in ['en']:
|
||||
fetch_actor_list_lang(lang=lang)
|
||||
|
||||
|
||||
# 获取演员列表
|
||||
def fetch_actor_list2():
|
||||
next_url = scraper.actors_uncensored_base_url
|
||||
while next_url:
|
||||
logging.info(f'fetching page {next_url}')
|
||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
|
||||
if list_data :
|
||||
# 写入数据库
|
||||
for row in list_data:
|
||||
actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
|
||||
if actor_id:
|
||||
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'insert performer index failed. name: {row['name']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'fetch actor error. {next_url} ...')
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
||||
break
|
||||
|
||||
# 获取makers列表
|
||||
def fetch_makers_list():
|
||||
next_url = scraper.makers_uncensored_base_url
|
||||
|
||||
Reference in New Issue
Block a user