modify scripts

This commit is contained in:
2025-03-16 15:19:52 +08:00
parent e136de53f2
commit dab493f8e7
5 changed files with 80 additions and 7 deletions

View File

@ -262,7 +262,8 @@ def fetch_performers_detail_once(perfomers_list):
else:
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
time.sleep(1)
@ -305,10 +306,11 @@ def fetch_movies_detail():
logging.info(f'all movies fetched.')
break
last_movie_id = 0
succ_count = 0
for movie in movies_list:
url = movie['href']
title = movie['title']
logging.info(f"Fetching data for movie ({title}), url {url} ...")
logging.debug(f"Fetching data for movie ({title}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
@ -322,6 +324,7 @@ def fetch_movies_detail():
if movie_id:
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
last_movie_id = movie_id
succ_count += 1
else:
logging.warning(f'insert movie {url} failed.')
@ -330,11 +333,13 @@ def fetch_movies_detail():
else:
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
# 标记为已处理
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
else:
logging.warning(f'fetch_page error. url: {url}')
time.sleep(1)
logging.info(f'insert {len(movies_list)} movies. last movie id: {last_movie_id}')
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
# 调试增加break
if debug:
return True

View File

@ -52,6 +52,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
response.raise_for_status() # 处理 HTTP 错误
# 过期的网页与404相同处理
if "invalid or outdated page" in response.text.lower():
logging.warning(f"invalid or outdated page: {url}")
return None, 404 # 直接返回 404调用方可以跳过
# 预处理 HTML如果提供了 preprocessor
html_text = preprocessor(response.text) if preprocessor else response.text

View File

@ -251,6 +251,39 @@ def insert_or_update_performer(data):
logging.error(f"未知错误: {e}")
return None
def insert_or_update_performer_404(name, href):
    """Insert or update a performer row for an abnormal URL (e.g. a 404 page).

    Upserts a row into ``iafd_performers`` keyed on ``href``, setting
    ``is_full_data`` to 1 and ``updated_at`` to the current local time,
    then commits the change.

    Args:
        name: Performer name to store on the row.
        href: URL of the performer page; used as the conflict key.

    Returns:
        The id returned by ``get_id_by_href`` for ``href``, or ``None`` if
        the lookup yields nothing or any error occurs (the transaction is
        rolled back in that case).
    """
    upsert_sql = (
        "\n"
        "INSERT INTO iafd_performers (href, name, is_full_data, updated_at)\n"
        "VALUES (?, ?, 1, datetime('now', 'localtime'))\n"
        "ON CONFLICT(href) DO UPDATE SET\n"
        "name = excluded.name,\n"
        "is_full_data = 1,\n"
        "updated_at = datetime('now', 'localtime')\n"
    )
    try:
        cursor.execute(upsert_sql, (href, name))
        # Commit right away, as insert_or_update_movie_404 does; without this
        # the upsert stays in an open transaction and can be lost or undone
        # by a later rollback.
        conn.commit()

        # Look up the performer id for the row just written.
        performer_id = get_id_by_href('iafd_performers', href)
        if performer_id is None:
            return None

        logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
        return performer_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"未知错误: {e}")
        return None
# 按 id 或 href 删除演员
def delete_performer(identifier):
try:
@ -610,6 +643,35 @@ def insert_or_update_movie(movie_data):
logging.error("Error inserting movie: %s", e)
return None
def insert_or_update_movie_404(title, href):
    """Insert or update a movie row for an abnormal URL (e.g. a 404 page).

    Upserts a row into ``iafd_movies`` keyed on ``href``, setting
    ``is_full_data`` to 1 and ``updated_at`` to the current local time,
    and commits the change.

    Args:
        title: Movie title to store on the row.
        href: URL of the movie page; used as the conflict key.

    Returns:
        The id returned by ``get_id_by_href`` for ``href`` (which may be
        ``None``), or ``None`` if any exception is raised, in which case
        the transaction is rolled back.
    """
    upsert_sql = (
        "\n"
        "INSERT INTO iafd_movies (title, href, is_full_data, updated_at)\n"
        "VALUES (?, ?, 1, datetime('now', 'localtime'))\n"
        "ON CONFLICT(href) DO UPDATE SET\n"
        "title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')\n"
    )
    params = (title, href)
    try:
        # Write (or refresh) the row and persist it.
        cursor.execute(upsert_sql, params)
        conn.commit()

        # The lookup result is returned as-is; None signals "not found".
        return get_id_by_href('iafd_movies', href)
    except Exception as err:
        conn.rollback()
        logging.error("Error inserting movie: %s", err)
        return None
# Delete movie data
def delete_movie(identifier):
try: