modify scripts

This commit is contained in:
oscarz
2025-04-23 17:43:30 +08:00
parent f6385b83e4
commit 2e2218b623
2 changed files with 50 additions and 5 deletions

View File

@ -160,6 +160,8 @@ def fetch_performers_detail():
if len(perfomers_list) < 1:
logging.info(f'all performers fetched.')
break
succ_rows = 0
for performer in perfomers_list:
url = performer['href']
person = performer['name']
@ -168,8 +170,9 @@ def fetch_performers_detail():
next_url = url
all_movies = []
need_insert = True
while next_url:
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url)
@ -179,11 +182,22 @@ def fetch_performers_detail():
all_movies.extend(data.get('movies', []))
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...')
actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
elif status_code and status_code == 401:
actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
logging.warning(f'fetch_page error. url: {url}')
# 如果出现了401或者404已经处理直接跳过
if not need_insert:
continue
# 获取完了个人的所有影片,开始插入数据
performer_id = db_tools.insert_or_update_actor({
'href': url,
@ -193,11 +207,14 @@ def fetch_performers_detail():
'credits':all_movies
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_perfomer_id = performer_id
succ_rows += 1
else:
logging.warning(f'insert person: ({person}) {url} failed.')
time.sleep(0.5)
logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
# 调试break
if debug:
return True

View File

@ -172,6 +172,34 @@ def insert_or_update_actor(actor):
logging.error(f"插入/更新演员 {actor['name']} 失败: {e}")
conn.rollback()
# Insert or update an actor row for an abnormal URL (e.g. a page that returned 404).
def insert_or_update_actor_404(name, href, is_full_data=1):
    """Upsert a minimal actor row for a URL whose detail page could not be fetched.

    Only ``name``, ``href``, ``is_full_data`` and ``updated_at`` are written to
    ``javdb_actors``; an existing row with the same ``href`` has those columns
    overwritten.

    Args:
        name: Actor name to store.
        href: Actor page URL; this is the conflict key of the upsert.
        is_full_data: Status flag stored with the row. Its value meanings are
            not defined in this function -- NOTE(review): confirm against the
            callers (e.g. which value marks a 404 page).

    Returns:
        Whatever ``get_id_by_href('javdb_actors', href)`` returns for the row
        (presumably ``None`` when no row is found -- verify against that
        helper), or ``None`` if any exception was raised. On an exception the
        transaction is rolled back and the error is logged.
    """
    try:
        # Upsert keyed on href: a repeated call refreshes the row instead of
        # raising a uniqueness error.
        cursor.execute(
            """
            INSERT INTO javdb_actors (name, href, is_full_data, updated_at)
            VALUES (?, ?, ?, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name=excluded.name,
                is_full_data=excluded.is_full_data,
                updated_at = datetime('now', 'localtime')
            """,
            (name, href, is_full_data)
        )
        conn.commit()

        # Look the id up by href so the same path serves both the insert and
        # the update case.
        return get_id_by_href('javdb_actors', href)

    except Exception as e:
        conn.rollback()
        # The message previously said "movie", which was misleading in logs
        # because this function writes to the actors table.
        logging.error("Error inserting actor (name=%s, href=%s): %s", name, href, e)
        return None
# 删除演员
def delete_actor_by_href(href):
try: