modify scripts
This commit is contained in:
@ -160,6 +160,8 @@ def fetch_performers_detail():
|
|||||||
if len(perfomers_list) < 1:
|
if len(perfomers_list) < 1:
|
||||||
logging.info(f'all performers fetched.')
|
logging.info(f'all performers fetched.')
|
||||||
break
|
break
|
||||||
|
|
||||||
|
succ_rows = 0
|
||||||
for performer in perfomers_list:
|
for performer in perfomers_list:
|
||||||
url = performer['href']
|
url = performer['href']
|
||||||
person = performer['name']
|
person = performer['name']
|
||||||
@ -168,8 +170,9 @@ def fetch_performers_detail():
|
|||||||
|
|
||||||
next_url = url
|
next_url = url
|
||||||
all_movies = []
|
all_movies = []
|
||||||
|
need_insert = True
|
||||||
while next_url:
|
while next_url:
|
||||||
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
|
logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
|
||||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
|
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
|
||||||
if soup:
|
if soup:
|
||||||
data, next_url = scraper.parse_actor_detail(soup, next_url)
|
data, next_url = scraper.parse_actor_detail(soup, next_url)
|
||||||
@ -179,11 +182,22 @@ def fetch_performers_detail():
|
|||||||
all_movies.extend(data.get('movies', []))
|
all_movies.extend(data.get('movies', []))
|
||||||
|
|
||||||
elif status_code and status_code == 404:
|
elif status_code and status_code == 404:
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...')
|
actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
|
||||||
|
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
|
||||||
|
need_insert = False
|
||||||
|
break
|
||||||
|
elif status_code and status_code == 401:
|
||||||
|
actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
|
||||||
|
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
|
||||||
|
need_insert = False
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
|
logging.warning(f'fetch_page error. url: {url}')
|
||||||
|
|
||||||
|
# 如果出现了401或者404,已经处理,直接跳过
|
||||||
|
if not need_insert:
|
||||||
|
continue
|
||||||
|
|
||||||
# 获取完了个人的所有影片,开始插入数据
|
# 获取完了个人的所有影片,开始插入数据
|
||||||
performer_id = db_tools.insert_or_update_actor({
|
performer_id = db_tools.insert_or_update_actor({
|
||||||
'href': url,
|
'href': url,
|
||||||
@ -193,11 +207,14 @@ def fetch_performers_detail():
|
|||||||
'credits':all_movies
|
'credits':all_movies
|
||||||
})
|
})
|
||||||
if performer_id:
|
if performer_id:
|
||||||
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
|
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
|
||||||
last_perfomer_id = performer_id
|
last_perfomer_id = performer_id
|
||||||
|
succ_rows += 1
|
||||||
else:
|
else:
|
||||||
logging.warning(f'insert person: ({person}) {url} failed.')
|
logging.warning(f'insert person: ({person}) {url} failed.')
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
|
||||||
# 调试break
|
# 调试break
|
||||||
if debug:
|
if debug:
|
||||||
return True
|
return True
|
||||||
|
|||||||
@ -172,6 +172,34 @@ def insert_or_update_actor(actor):
|
|||||||
logging.error(f"插入/更新演员 {actor['name']} 失败: {e}")
|
logging.error(f"插入/更新演员 {actor['name']} 失败: {e}")
|
||||||
conn.rollback()
|
conn.rollback()
|
||||||
|
|
||||||
|
# Insert or update an actor row for an abnormal URL (e.g. a 404 page).
def insert_or_update_actor_404(name, href, is_full_data=1):
    """Upsert a minimal actor record for a page that could not be fetched.

    Writes ``name``, ``href`` and ``is_full_data`` into ``javdb_actors``,
    keyed on ``href``: an existing row with the same ``href`` has its
    ``name``, ``is_full_data`` and ``updated_at`` overwritten.

    Args:
        name: Actor name to store.
        href: Actor page URL; the conflict target of the upsert.
        is_full_data: Status flag stored verbatim in the ``is_full_data``
            column. The meaning of each value is defined by the callers
            (NOTE(review): confirm the value conventions against them).

    Returns:
        Whatever ``get_id_by_href('javdb_actors', href)`` returns for the
        row (presumably its id, or ``None`` when no row is found), or
        ``None`` if any exception is raised while writing or looking up.
    """
    try:
        cursor.execute(
            """
            INSERT INTO javdb_actors (name, href, is_full_data, updated_at)
            VALUES (?, ?, ?, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name=excluded.name, is_full_data=excluded.is_full_data, updated_at = datetime('now', 'localtime')
            """,
            (name, href, is_full_data)
        )
        conn.commit()

        # Look the row up by href so the id is returned for both the insert
        # and the update path; a missing row simply yields the helper's result.
        return get_id_by_href('javdb_actors', href)

    except Exception as e:
        # Best-effort write: undo the pending transaction and report the
        # failure to the caller as None instead of raising.
        conn.rollback()
        logging.error("Error inserting actor (%s) %s: %s", name, href, e)
        return None
|
||||||
|
|
||||||
|
|
||||||
# 删除演员
|
# 删除演员
|
||||||
def delete_actor_by_href(href):
|
def delete_actor_by_href(href):
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user