# resources/iafd/src/fetch.py
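"""Fetch IAFD data into a local SQLite database.

Performer index pages are collected by astrological sign, birthday, or
ethnicity; movie index pages by distributor or studio; detail pages are then
fetched for every indexed performer and movie. Individual steps are selected
with --cmd (see function_map below).
"""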

import json
import time
import csv
import sys
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils
from pathlib import Path
# Add the repository root to sys.path so the src package can be imported
root_dir = str(Path(__file__).resolve().parent.parent.parent)
sys.path.append(root_dir)
from src.monitor.scheduler import CommandScheduler
from src.utils.utils import pretty_print_json
config.setup_logging()
debug = False
force = False
skip_local = True
# Fetch performer lists by astrological sign (no pagination)
def fetch_performers_by_astro():
for astro in scraper.astro_list:
url = scraper.astr_base_url + astro
logging.info(f"Fetching data for {astro}, url {url} ...")
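        # Retry the same page until every parsed row has been written to the database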
while True:
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_astro(soup, astro)
if list_data:
all_updated = True
for row in list_data :
                        # Write the performer into the index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
if perfomer_id:
logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
else:
logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
all_updated = False
                    # Only move on once every row has been written successfully
if all_updated:
break
else:
logging.warning(f'fetch astro error. {url} ...')
time.sleep(0.5)
elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
break
else:
logging.warning(f'fetch astro error. {url} ...')
time.sleep(3)
        # Break early in debug mode
if debug:
break
# Fetch performer lists by birthday (no pagination)
def fetch_performers_by_birth():
    for month in range(1, 13):  # iterate over months 1-12
        for day in range(1, 32):  # iterate over days 1-31
url = scraper.birth_base_url.format(month=month, day=day)
logging.info(f"Fetching data for birth, url {url}")
while True:
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_page_birth(soup, month, day)
if list_data:
all_updated = True
for row in list_data :
                            # Write the performer into the index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
if perfomer_id:
logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
else:
logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
all_updated = False
                        # Only move on once every row has been written successfully
if all_updated:
break
else:
                        logging.warning(f'fetch birth error. {url} ...')
time.sleep(1)
elif status_code and status_code == 404:
                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
break
else:
                    logging.warning(f'fetch birth error. {url} ...')
time.sleep(3)
            # Return early in debug mode
if debug:
return True
# Refresh the ethnicity list
def fetch_ethic_list():
url = scraper.ethnic_list_url
logging.info(f"Fetching data for performer's ethnic list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="ethnicity1", attr_type="id"))
if soup:
list_data = scraper.parse_page_ethnic_list(soup, url)
if list_data:
for row in list_data :
dist_id = db_tools.insert_or_update_ethnic({'name': row['name'], 'href': row.get('href', '')})
if dist_id:
logging.debug(f"insert one record into ethnic table. id:{dist_id}, name: {row['name']}, href:{row.get('href', '')}")
else:
logging.warning(f'fetch ethnic error. {url} ...')
elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
# Fetch performer lists by ethnicity (paginated)
def fetch_performers_by_ethnic():
    # Refresh the ethnicity list first
fetch_ethic_list()
ethnic_list = db_tools.query_ethnic_hrefs()
for row in ethnic_list:
url = row['href']
ethnic = row['name']
next_url = url
count = 0
pages = 0
while next_url:
logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
if soup:
list_data, next_page_url = scraper.parse_page_ethnic(soup, ethnic)
if list_data:
all_updated = True
for row in list_data :
                        # Write the performer into the index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
if perfomer_id:
count += 1
logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
else:
logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
all_updated = False
                    # Only advance to the next page once every row has been written successfully
if all_updated:
next_url = next_page_url
else:
                    logging.warning(f'fetch ethnic error. {next_url} ...')
elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
break
else:
                logging.warning(f'fetch ethnic error. {next_url} ...')
time.sleep(3)
            pages += 1
            # Return early in debug mode
if debug:
return True
logging.info(f"fetched data for {ethnic} total pages: {pages}, total performers: {count}")
# Fetch the distributors list
def fetch_distributors_list():
url = scraper.distributors_list_url
logging.info(f"Fetching data for distributors list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
if list_data:
for row in list_data :
dis_url = scraper.distributors_base_url + row['href']
dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
if dist_id:
logging.debug(f"insert one record into distributors table. id:{dist_id}, name: {row['name']}, href:{dis_url}")
else:
            logging.warning(f'fetch distributors list error. {url} ...')
elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
        logging.warning(f'fetch distributors list error. {url} ...')
# Fetch the studios list
def fetch_studios_list():
url = scraper.studios_list_url
logging.info(f"Fetching data for studios list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
if list_data:
for row in list_data :
stu_url = scraper.studios_base_url + row['href']
stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
if stu_id:
logging.debug(f"insert one record into studios table. id:{stu_id}, name: {row['name']}, href:{stu_url}")
else:
            logging.warning(f'fetch studios list error. {url} ...')
elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
        logging.warning(f'fetch studios list error. {url} ...')
# Update movie index entries for each distributor in the list
def fetch_movies_by_dist():
    # Refresh the distributor list first
fetch_distributors_list()
url_list = db_tools.query_distributor_hrefs()
if debug:
url_list = db_tools.query_distributor_hrefs(name='vixen.com')
for url in url_list:
logging.info(f"Fetching data for distributor url {url} ...")
while True:
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
if list_data:
all_updated = True
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
if tmp_id:
logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
else:
logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
all_updated = False
                    # Only move on once every row has been written successfully
if all_updated:
break
else :
logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(1)
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
break
else:
logging.warning(f'fetching page error. {url}')
time.sleep(3)
        # Break early in debug mode
if debug:
break
# Update movie index entries for each studio in the list
def fetch_movies_by_stu():
    # Refresh the studio list first
fetch_studios_list()
url_list = db_tools.query_studio_hrefs()
if debug:
url_list = db_tools.query_studio_hrefs(name='vixen.com')
for url in url_list:
logging.info(f"Fetching data for studio url {url} ...")
while True:
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
if list_data:
all_updated = True
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
if tmp_id:
logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
else:
logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
all_updated = False
                    # Only move on once every row has been written successfully
if all_updated:
break
else :
logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(1)
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
break
else:
logging.warning(f'fetching page error. {url}')
time.sleep(3)
        # Break early in debug mode
if debug:
break
# Update performer details (single pass over one batch)
def fetch_performers_detail_once(perfomers_list):
last_performer_id = 0
for performer in perfomers_list:
url = performer['href']
person = performer['name']
curr_id = performer['id']
movies_cnt = performer['movies_cnt']
logging.debug(f"Fetching data for performer ({person}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        # Page came from the local cache; skip it
if skip_local and status_code == scraper.http_code_local :
last_performer_id = curr_id
continue
if soup:
data = scraper.parse_page_performer(soup, url)
if data:
                # Check whether the movie count has changed
page_movies_cnt = int(data.get('movies_cnt', '0'))
movies_changed = True
if page_movies_cnt <= movies_cnt:
movies_changed = False
if not force:
                        logging.info(f"performer already up to date, skipping... person: ({person}), url: {url}")
last_performer_id = curr_id
continue
performer_id = db_tools.insert_or_update_performer({
'href': url,
'person': person,
**data
},
movies_update=movies_changed
)
if performer_id:
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_performer_id = performer_id
else:
logging.warning(f'insert person: ({person}) {url} failed.')
                # Write the record to a local JSON file
utils.write_person_json(person, url, {
'href': url,
'person': person,
**data
})
else:
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
elif status_code and status_code == scraper.http_code_404:
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_404)
            logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
elif status_code and status_code == scraper.http_code_url:
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_url)
            logging.warning(f'601 page (wrong url). id: {performer_id}, name: {person}, url: {url}, Skipping...')
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
        if status_code != 99:  # data fetched from the live site; throttle the request rate
time.sleep(0.5)
return last_performer_id
# Update performer details
def fetch_performers_detail():
limit_count = 5 if debug else 100
perfomers_list = []
last_perfomer_id = 0
    # Get the list of new performers
while True:
        if force:  # walk through every record from the beginning
perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
        else:  # incremental updates only
perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
if len(perfomers_list) < 1:
logging.info(f'all new performers fetched. ')
break
last_perfomer_id = fetch_performers_detail_once(perfomers_list)
logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
if debug:
break
# Update movie details
def fetch_movies_detail():
limit_count = 10 if debug else 100
movies_list = []
last_movie_id = 0
while True:
        if force:  # walk through every record from the beginning
movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
        else:  # incremental updates only
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
if len(movies_list) < 1:
logging.info(f'all movies fetched.')
break
succ_count = 0
for movie in movies_list:
url = movie['href']
title = movie['title']
curr_id = movie['id']
logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
            # Page came from the local cache; skip it
if skip_local and status_code == scraper.http_code_local :
last_movie_id = curr_id
succ_count += 1
continue
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
if movie_data :
                    # Normalize non-standard URLs
if movie_data['DistributorHref']:
movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
if movie_data['StudioHref']:
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
last_movie_id = movie_id
succ_count += 1
else:
logging.warning(f'insert movie {url} failed.')
                    # Write the record to a local JSON file
utils.write_movie_json(url, movie_data)
else:
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == scraper.http_code_404:
                # Mark as processed
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
elif status_code and status_code == scraper.http_code_url:
                # Mark as processed
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_url)
                logging.warning(f'601 page (wrong url). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
else:
logging.warning(f'fetch_page error. url: {url}')
            if status_code != 99:  # data fetched from the live site; throttle the request rate
time.sleep(0.5)
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Return early in debug mode
if debug:
return True
def reset_actor_movie_cnt():
db_tools.reset_actor_movies()
def check_task_status():
    # Print aggregate task statistics
result = db_tools.get_statics()
pretty_print_json(result)
# Map command shortcuts to their functions
function_map = {
"astro": fetch_performers_by_astro,
"birth": fetch_performers_by_birth,
"ethnic": fetch_performers_by_ethnic,
"dist" : fetch_movies_by_dist,
"stu" : fetch_movies_by_stu,
"performers": fetch_performers_detail,
"movies" : fetch_movies_detail,
"reset_mv" : reset_actor_movie_cnt,
"check" : check_task_status,
}
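# Example invocations (shortcuts may be combined, comma-separated):
#   python fetch.py --cmd astro,performers --debug
#   python fetch.py --cmd movies --force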
# Main entry point
def main(cmd, args_debug, args_force, args_skip_local):
global debug
debug = args_debug
if debug:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
global force
force = args_force
global skip_local
skip_local = args_skip_local
if cmd.lower() == 'check':
check_task_status()
return None
    # Start the task (create a task log entry)
task_id = db_tools.insert_task_log()
if task_id is None:
logging.warning(f'insert task log error.')
return None
logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
    # Example shell command to execute
shell_command = "cd ~/projects/resources/src/monitor; chmod u+x ./run.sh; ./run.sh iafd"
    # Create a command scheduler that runs every 30 minutes
scheduler = CommandScheduler(
command=shell_command,
interval=10 if debug else 1800
)
scheduler.run_periodically()
    # Run the requested functions
if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
func()
else:
print(f"Warning: {short_name} is not a valid function shortcut.")
    else:  # run everything
for name, func in function_map.items():
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {name}')
func()
else:
print(f"Warning: {name} is not a valid function shortcut.")
    logging.info('all processing completed!')
db_tools.finalize_task_log(task_id)
scheduler.stop()
# TODO:
# 1. The movie count on a performer's page may differ from the count aggregated from the movie table.
#    One cause: a title can have multiple directors, and a director may also appear as a performer, e.g.:
#    https://www.iafd.com/title.rme/id=0f79d81f-25ff-40d1-967a-24b99f03b79a
#    https://www.iafd.com/person.rme/id=37efc86d-fefe-436d-8e3e-2e04b4e6565c
#    The movie table currently stores director information incompletely and needs to be fixed.
if __name__ == "__main__":
    # Command-line argument handling
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch iafd data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='Force update (re-fetch records even if already up to date)')
    parser.add_argument('--skip_local', action='store_true', help='Skip pages served from the local HTML cache')
args = parser.parse_args()
main(args.cmd, args.debug, args.force, args.skip_local)