modify scripts
This commit is contained in:
@ -262,7 +262,8 @@ def fetch_performers_detail_once(perfomers_list):
|
||||
else:
|
||||
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
|
||||
logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
|
||||
time.sleep(1)
|
||||
@ -305,10 +306,11 @@ def fetch_movies_detail():
|
||||
logging.info(f'all movies fetched.')
|
||||
break
|
||||
last_movie_id = 0
|
||||
succ_count = 0
|
||||
for movie in movies_list:
|
||||
url = movie['href']
|
||||
title = movie['title']
|
||||
logging.info(f"Fetching data for movie ({title}), url {url} ...")
|
||||
logging.debug(f"Fetching data for movie ({title}), url {url} ...")
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
|
||||
if soup:
|
||||
movie_data = scraper.parse_page_movie(soup, url, title)
|
||||
@ -322,6 +324,7 @@ def fetch_movies_detail():
|
||||
if movie_id:
|
||||
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
|
||||
last_movie_id = movie_id
|
||||
succ_count += 1
|
||||
else:
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
|
||||
@ -330,11 +333,13 @@ def fetch_movies_detail():
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||
# 标记为已处理
|
||||
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
|
||||
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
time.sleep(1)
|
||||
logging.info(f'insert {len(movies_list)} movies. last movie id: {last_movie_id}')
|
||||
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
|
||||
# 调试增加break
|
||||
if debug:
|
||||
return True
|
||||
|
||||
@ -52,6 +52,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
# 过期的网页,与404相同处理
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
logging.warning(f"invalid or outdated page: {url}")
|
||||
return None, 404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||
|
||||
|
||||
@ -251,6 +251,39 @@ def insert_or_update_performer(data):
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# Insert or update performer data (handles abnormal URLs, e.g. 404 links)
|
||||
def insert_or_update_performer_404(name, href):
    """Record a performer whose page could not be fetched (e.g. HTTP 404).

    Upserts a row into ``iafd_performers`` keyed on ``href``: a new row is
    inserted with ``is_full_data = 1``; an existing row gets its ``name``,
    ``is_full_data`` and ``updated_at`` refreshed. The change is committed
    before the id is looked up, matching ``insert_or_update_movie_404``.

    NOTE(review): ``is_full_data = 1`` presumably stops the crawler from
    retrying this URL -- confirm against the query that selects pending
    performers.

    Args:
        name: Performer name to store.
        href: Performer page URL; the conflict key of the upsert.

    Returns:
        The id returned by ``get_id_by_href('iafd_performers', href)``, or
        ``None`` if the lookup finds nothing or any error occurs (the
        transaction is rolled back on error).
    """
    try:
        cursor.execute(
            """
            INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
            VALUES (?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                is_full_data = 1,
                updated_at = datetime('now', 'localtime')
            """,
            (href, name),
        )
        # Persist the marker right away; without this commit the row only
        # survives if some later, unrelated call commits on the connection.
        conn.commit()

        # Fetch the performer id for the row just written.
        performer_id = get_id_by_href('iafd_performers', href)
        if performer_id is None:
            return None
        logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')

        return performer_id

    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"未知错误: {e}")
        return None
|
||||
|
||||
|
||||
# 按 id 或 href 删除演员
|
||||
def delete_performer(identifier):
|
||||
try:
|
||||
@ -610,6 +643,35 @@ def insert_or_update_movie(movie_data):
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||
def insert_or_update_movie_404(title, href):
    """Record a movie whose page could not be fetched (e.g. HTTP 404).

    Upserts a row into ``iafd_movies`` keyed on ``href`` with
    ``is_full_data = 1``, commits, and then looks up the row id.

    Args:
        title: Movie title to store.
        href: Movie page URL; the conflict key of the upsert.

    Returns:
        The id returned by ``get_id_by_href('iafd_movies', href)`` (which
        may be ``None``), or ``None`` if any exception is raised, in which
        case the transaction is rolled back.
    """
    upsert_sql = """
                INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
                VALUES (?, ?, 1, datetime('now', 'localtime'))
                ON CONFLICT(href) DO UPDATE SET
                    title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')
            """
    try:
        # Insert the movie, or refresh the existing row for this href.
        cursor.execute(upsert_sql, (title, href))
        conn.commit()

        # The lookup result is returned as-is; None passes straight through.
        return get_id_by_href('iafd_movies', href)
    except Exception as e:
        conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None
|
||||
|
||||
|
||||
# Delete movie data
|
||||
def delete_movie(identifier):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user