add some scripts.

2025-02-28 11:34:26 +08:00
parent f482a3353f
commit 3c14ce8cf2
6 changed files with 6644 additions and 2117 deletions


@@ -63,6 +63,46 @@ def load_existing_hrefs():
logging.info("detail.json not found, starting fresh.")
return existing_hrefs
# Parse the credits table; it can hold personal (acting) credits as well as directorial credits
def parse_credits_table(table, distributor_list):
# find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose()  # drop the thead, it does not need parsing
# only the tbody is left now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
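# Illustrative sketch (not part of this commit): one way the new parse_credits_table
# helper could be exercised on its own. page_html is a hypothetical string holding a
# fetched profile page; the distributor keywords mirror the list used below in
# fetch_and_parse_page.
def _sketch_parse_credits(page_html):
    from bs4 import BeautifulSoup  # the script already parses pages with BeautifulSoup
    soup = BeautifulSoup(page_html, 'html.parser')
    # prefer the personal credits table, fall back to the directorial one
    table = soup.find('table', id='personal') or soup.find('table', id='directoral')
    if table is None:
        return [], {}
    return parse_credits_table(table, ['vixen', 'blacked', 'tushy', 'x-art'])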
# Fetch the page and extract the required data
def fetch_and_parse_page(url, scraper):
try:
@@ -96,61 +136,31 @@ def fetch_and_parse_page(url, scraper):
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the table data
movies = []
vixen_cnt = 0
blacked_cnt = 0
tushy_cnt = 0
x_art_cnt = 0
role = 'personal'
table = soup.find('table', id='personal')
if table is None:
table = soup.find('table', id='directoral')
role = 'directoral'
if table:
# find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose()  # drop the thead, it does not need parsing
# Parse the table data: collect the acting and directorial credit lists
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# only the tbody is left now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
# use a dict to hold the counts
distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
# count the total number of movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# count keyword hits in the distributor field
if 'vixen' in distributor:
vixen_cnt += 1
if 'blacked' in distributor:
blacked_cnt += 1
if 'tushy' in distributor:
tushy_cnt += 1
if 'x_art' in distributor:
x_art_cnt += 1
# nothing was found
if len(credits_list) == 0:
logging.warning(f"movie table empty. url: {url}")
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
else:
logging.warning(f"movie table empty. ")
# iterate over each bioheading
# iterate over each bioheading and collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
@@ -172,13 +182,13 @@ def fetch_and_parse_page(url, scraper):
data[kkey] = biodata
# add the statistics to data
data['movies_cnt'] = len(movies)
data['vixen_cnt'] = vixen_cnt
data['blacked_cnt'] = blacked_cnt
data['tushy_cnt'] = tushy_cnt
data['x_art_cnt'] = x_art_cnt
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, {'role': role, 'movies' : movies}
return data, credits_list
except RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None, None
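# Note (illustrative, not part of this commit): the second value returned by
# fetch_and_parse_page is now a dict keyed by role instead of a flat movie list, roughly:
#     credits_list = {
#         'personal':   [{'title': ..., 'year': ..., 'distributor': ..., 'notes': ..., 'rev': ..., 'formats': ...}, ...],
#         'directoral': [...],
#     }
# where a role key is present only when the matching table was found, while data carries
# the bio fields plus the movies_cnt / vixen_cnt / blacked_cnt / tushy_cnt / x_art_cnt totals.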
@@ -270,7 +280,35 @@ def write_person_json(person, href, data):
logging.error(f"Error writing file {full_path}: {e}")
def main():
# Fetch one specified URL
def process_one(href):
# initialize cloudscraper
scraper = cloudscraper.create_scraper()
# fetch and parse the data
while True:
data, movies = fetch_and_parse_page(href, scraper)
if data is None:
logging.warning(f'Retrying {href}')
time.sleep(3)
else:
break
# write the performer's standalone JSON file
full_data = {
**data,
'credits': movies if movies else {}
}
person_id = extract_id_from_href(href)
person_filename = f"{person_id}.json"  # file name derived from the id in the href
try:
with open(person_filename, 'w', encoding='utf-8') as json_file:
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {person_filename}: {e}")
print(f'Fetch succeeded. Saved result to {person_filename}')
def process_all():
# initialize cloudscraper
scraper = cloudscraper.create_scraper()
@@ -297,7 +335,7 @@ def main():
# fetch and parse the data
while True:
data, movies = fetch_and_parse_page(href, scraper)
data, credits = fetch_and_parse_page(href, scraper)
if data is None:
logging.warning(f'Retrying {href} - {person}')
time.sleep(3)
@@ -316,7 +354,7 @@ def main():
'href': href,
'person': person,
**data,
'credits': movies if movies else {}
'credits': credits if credits else {}
}
write_person_json(person.strip(), href, full_data)
@@ -333,14 +371,23 @@ def main():
# delay so requests are not sent fast enough to get blocked
time.sleep(1)
if __name__ == "__main__":
# Full crawl
def main():
try:
# register exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
main()
process_all()
finally:
# cleanup; make sure this runs when the program exits normally
write_to_csv(final_data) # Write to CSV or other necessary tasks
write_to_detail_json(final_data) # Save data to JSON
logging.info("Data processing completed.")
logging.info("Data processing completed.")
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()
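# Usage sketch (illustrative, not part of this commit); "scrape.py" stands in for the
# actual script name, which this diff does not show:
#     python scrape.py https://example.com/person/123.htm   # fetch a single profile via process_one
#     python scrape.py                                       # full crawl via main() -> process_all()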