modify scripts
@@ -247,7 +247,7 @@ def fetch_performers_detail_once(perfomers_list):
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
         # page was read from the local cache; skip it
-        if skip_local and status_code == 99:
+        if skip_local and status_code == scraper.http_code_local:
             last_performer_id = curr_id
             continue
         if soup:
@@ -272,11 +272,11 @@ def fetch_performers_detail_once(perfomers_list):
                 })
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
-        elif status_code and status_code == 404:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
+        elif status_code and status_code == scraper.http_code_404:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_404)
             logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
-        elif status_code and status_code == 601:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
+        elif status_code and status_code == scraper.http_code_url:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_url)
             logging.warning(f'601 page (wrong url). id: {performer_id}, name: {person}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
@@ -293,7 +293,7 @@ def fetch_performers_detail():
     # fetch the list of new performers
     while True:
         if force:  # walk the whole table from the beginning
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else:  # update only
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -322,7 +322,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force:  # walk the whole table from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else:  # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -336,8 +336,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             # page was read from the local cache; skip it
-            if skip_local and status_code == 99:
+            if skip_local and status_code == scraper.http_code_local:
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -359,13 +360,13 @@ def fetch_movies_detail():
                     utils.write_movie_json(url, movie_data)
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
+            elif status_code and status_code == scraper.http_code_404:
                 # mark as handled
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
-            elif status_code and status_code == 601:
+            elif status_code and status_code == scraper.http_code_url:
                 # mark as handled
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_url)
                 logging.warning(f'601 page (wrong url). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
@@ -36,6 +36,11 @@ headers = {
 }
 scraper = cloudscraper.create_scraper()

+http_code_404 = 404
+http_code_login = 401
+http_code_url = 601
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True

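This hunk replaces the scraper's magic status numbers (99, 404, 401, 601) with module-level constants. As a hedged aside only (not part of this commit), the same codes could also be grouped in an IntEnum so that comparisons against plain integers keep working; the FetchStatus name below is hypothetical:

```python
from enum import IntEnum

class FetchStatus(IntEnum):
    # real HTTP statuses the scraper reacts to
    NOT_FOUND = 404   # page does not exist
    LOGIN = 401       # page redirected to a login form
    # project-specific pseudo statuses
    BAD_URL = 601     # URL does not belong to the expected host
    LOCAL = 99        # page was served from the local HTML cache

# IntEnum members compare equal to plain ints, so existing checks keep working:
assert FetchStatus.LOCAL == 99 and FetchStatus.NOT_FOUND == 404
```

Keeping the local-cache code below 100 preserves the convention noted in fetch_page that codes under 100 mean the page came from disk rather than the network.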
@@ -49,27 +54,27 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor

         soup = BeautifulSoup(html_text, parser)
         if validator(soup):  # run the custom page check
-            return soup, 99  # return a code below 100 to signal the page came from the local cache
+            return soup, http_code_local  # return a code below 100 to signal the page came from the local cache

     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None, 601
+                return None, http_code_url

             response = scraper.get(url, headers=headers)

             # handle the HTTP status code
             if response.status_code == 404:
                 logging.debug(f"Page not found (404): {url}")
-                return None, 404  # return 404 directly so the caller can skip it
+                return None, http_code_404  # return 404 directly so the caller can skip it

             response.raise_for_status()  # raise on other HTTP errors

             # expired page, treated the same as a 404
             if "invalid or outdated page" in response.text.lower():
                 logging.debug(f"invalid or outdated page: {url}")
-                return None, 404  # return 404 directly so the caller can skip it
+                return None, http_code_404  # return 404 directly so the caller can skip it

             if save_raw_html:
                 utils.write_raw_html(url, response.text)
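With fetch_page now returning named codes, callers can branch on them without repeating magic numbers. A minimal sketch under the assumption that the module is importable as `scraper` (the name used by the calling scripts above); the fetch_and_classify helper and its return labels are hypothetical:

```python
from functools import partial

import scraper  # the module patched above; assumed importable under this name

def fetch_and_classify(url):
    """Hypothetical caller that maps fetch_page's (soup, status) pair to an action."""
    validator = partial(scraper.generic_validator, tag="div",
                        identifier="headshot", attr_type="id")
    soup, status_code = scraper.fetch_page(url, validator)

    if soup is not None:
        return "parse"            # hand the soup to the page parser (cached or fresh)
    if status_code == scraper.http_code_404:
        return "mark-missing"     # record the 404 and skip this URL
    if status_code == scraper.http_code_url:
        return "mark-bad-url"     # URL is not on the expected host
    return "retry-later"          # network error or failed validation
```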
@@ -14,7 +14,9 @@ config.setup_logging()

 debug = False
 force = False
-skip_local = True
+skip_local = False
+from_actor = False
+abnormal_only = False

 # fetch the actor list
 def fetch_actor_list():
@@ -152,10 +154,20 @@ def fetch_performers_detail():
     limit_count = 5 if debug else 100
     perfomers_list = []
     last_perfomer_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         # fetch a slice from the database each round instead of loading everything at once
         if force:  # walk the whole table from the beginning
-            perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else:  # update only
             perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
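The force-mode branch above now fans out into four nearly identical query_actors calls, selected by the new from_actor and abnormal_only flags. As a sketch only (not part of the commit), the same selection could be expressed by building the keyword arguments once; db_tools.query_actors and its parameters are taken from the diff, while the helper name is hypothetical:

```python
import db_tools  # project module used above; assumed importable

def query_performers_batch(last_perfomer_id, limit_count, abnormal_codes,
                           from_actor, abnormal_only):
    """Hypothetical helper equivalent to the four-way branch in force mode."""
    kwargs = {
        "start_id": last_perfomer_id,
        "order_by": "id asc",
        "limit": limit_count,
        "from_actor_list": 1 if from_actor else 0,
    }
    if abnormal_only:
        kwargs["is_full_data_in"] = abnormal_codes      # revisit only 404/401 rows
    else:
        kwargs["is_full_data_not_in"] = abnormal_codes  # skip rows already marked abnormal
    return db_tools.query_actors(**kwargs)
```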
@@ -182,13 +194,13 @@ def fetch_performers_detail():
                     alias = data.get('alias', [])
                     all_movies.extend(data.get('movies', []))

-                elif status_code and status_code == 404:
-                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
+                elif status_code and status_code == scraper.http_code_404:
+                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                     logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                     need_insert = False
                     break
-                elif status_code and status_code == 401:
-                    actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
+                elif status_code and status_code == scraper.http_code_login:
+                    actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=scraper.http_code_login)
                     logging.warning(f'401 page (need login). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                     need_insert = False
                     break
@@ -225,9 +237,19 @@ def fetch_movies_detail():
     limit_count = 10 if debug else 100
     movies_list = []
     last_movie_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         if force:  # walk the whole table from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else:  # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -241,8 +263,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
             # page was read from the local cache; skip it
-            if skip_local and status_code == 99:
+            if skip_local and status_code == scraper.http_code_local:
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             # parse the page and write it to the database
             if soup:
@@ -258,11 +281,11 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')

-            elif status_code and status_code == 404:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+            elif status_code and status_code == scraper.http_code_404:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
-            elif status_code and status_code == 401:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+            elif status_code and status_code == scraper.http_code_login:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                 logging.warning(f'401 page (need login). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
@@ -285,12 +308,12 @@ function_map = {
 }

 # main entry point
-def main(cmd, args_debug, args_force, args_skip_local):
+def main(cmd, args_debug, args_force, args_skip_local, args_from_actor, args_abnormal_only):
     global debug
     debug = args_debug
     if debug:
         logger = logging.getLogger()
-        #logger.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)

     global force
     force = args_force
@@ -298,6 +321,12 @@ def main(cmd, args_debug, args_force, args_skip_local):
     global skip_local
     skip_local = args_skip_local

+    global from_actor
+    from_actor = args_from_actor
+
+    global abnormal_only
+    abnormal_only = args_abnormal_only
+
     # start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
@@ -339,6 +368,8 @@ if __name__ == "__main__":
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
+    parser.add_argument('--from_actor', action='store_true', help='only walk performers/movies that came from actor_list (effective in force mode)')
+    parser.add_argument('--abnormal_only', action='store_true', help='only walk performers/movies whose URL is abnormal (404, login required, etc.; effective in force mode)')
     args = parser.parse_args()

-    main(args.cmd, args.debug, args.force, args.skip_local)
+    main(args.cmd, args.debug, args.force, args.skip_local, args.from_actor, args.abnormal_only)
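The two new flags are plain store_true switches forwarded into main(), and per the help text they only change which rows are queried when --force is set. A small, self-contained sketch of the flag plumbing, with a hypothetical script name in the comment:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--force', action='store_true')
parser.add_argument('--from_actor', action='store_true')
parser.add_argument('--abnormal_only', action='store_true')

# e.g. `python fetch_site.py --force --from_actor --abnormal_only`
args = parser.parse_args(['--force', '--from_actor', '--abnormal_only'])
assert (args.force, args.from_actor, args.abnormal_only) == (True, True, True)

# Without --force the update branch runs query_*(is_full_data=0, ...) and both
# new flags are ignored, matching the "(effective in force mode)" help text.
```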
@@ -25,6 +25,10 @@ headers = {
 }
 scraper = cloudscraper.create_scraper()

+http_code_404 = 404
+http_code_login = 401
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True

@@ -38,8 +42,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor

         soup = BeautifulSoup(html_text, parser)
         if validator(soup):  # run the custom page check
-            logging.info(f"read from local. href: {url}")
-            return soup, 99  # return a code below 100 to signal the page came from the local cache
+            logging.debug(f"read from local. href: {url}")
+            return soup, http_code_local  # return a code below 100 to signal the page came from the local cache

     for attempt in range(max_retries):
         try:
@@ -51,8 +55,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor

             # handle the HTTP status code
             if response.status_code == 404:
-                logging.warning(f"Page not found (404): {url}")
-                return None, 404  # return 404 directly so the caller can skip it
+                logging.debug(f"Page not found (404): {url}")
+                return None, http_code_404  # return 404 directly so the caller can skip it

             response.raise_for_status()  # raise on other HTTP errors

@@ -62,8 +66,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
             soup = BeautifulSoup(response.text, parser)
             # check whether we were redirected to the login page
             if soup.find('nav', class_='panel form-panel'):
-                logging.warning(f"Page redirected to login page on {url}.")
-                return None, 401
+                logging.debug(f"Page redirected to login page on {url}.")
+                return None, http_code_login

             if save_raw_html:
                 utils.write_raw_html(url, response.text)