add some scripts.
@@ -31,6 +31,8 @@ import csv
 import logging
 import signal
 import sys
+import os
+import re
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 import config
@@ -43,17 +45,19 @@ res_dir = './result'
 res_json_file = f'{res_dir}/detail.json'
 res_csv_file = f'{res_dir}/detail.csv'
 input_json_file = f'{res_dir}/merged.json'
+performers_dir = f'{res_dir}/performers'
 
 # Store the results
 final_data = []
 
-# Read the hrefs from detail.json
+# Read the data already in detail.json, so we can resume from where we left off
 def load_existing_hrefs():
     existing_hrefs = set()
+    global final_data
     try:
         with open(res_json_file, 'r') as file:
-            data = json.load(file)
-            for entry in data:
+            final_data = json.load(file)
+            for entry in final_data:
                 existing_hrefs.add(entry['href'])
     except FileNotFoundError:
         logging.info("detail.json not found, starting fresh.")
@@ -65,7 +69,7 @@ def fetch_and_parse_page(url, scraper):
         response = scraper.get(url)
         if response.status_code != 200:
             logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
-            return None
+            return None, None
 
         # Parse the HTML content
         soup = BeautifulSoup(response.text, 'html.parser')
@@ -92,6 +96,47 @@ def fetch_and_parse_page(url, scraper):
         }
         reversed_map = {v: k for k, v in fields.items()}
 
+        # Parse the table data
+        movies = []
+        vixen_cnt = 0
+        blacked_cnt = 0
+        tushy_cnt = 0
+        x_art_cnt = 0
+        table = soup.find('table', id='personal')
+        if table:
+            rows = table.find_all('tr', class_='we')
+            for row in rows:
+                cols = row.find_all('td')
+                if len(cols) >= 6:
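+                    # Columns: title, year, distributor, notes, rev, formats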
+                    title = cols[0].text.strip()
+                    year = cols[1].text.strip()
+                    distributor = cols[2].text.strip().lower()
+                    notes = cols[3].text.strip()
+                    rev = cols[4].text.strip()
+                    formats = cols[5].text.strip()
+
+                    # Count keywords in the distributor field
+                    if 'vixen' in distributor:
+                        vixen_cnt += 1
+                    if 'blacked' in distributor:
+                        blacked_cnt += 1
+                    if 'tushy' in distributor:
+                        tushy_cnt += 1
+                    if 'x_art' in distributor:
+                        x_art_cnt += 1
+
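+                    # Collect the row so the full credit list can be saved with the performer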
+                    movies.append({
+                        'title': title,
+                        'year': year,
+                        'distributor': distributor,
+                        'notes': notes,
+                        'rev': rev,
+                        'formats': formats
+                    })
+        else:
+            logging.warning("movie table empty.")
+
+
         # Iterate over each bioheading
         bioheadings = soup.find_all('p', class_='bioheading')
         for bio in bioheadings:
@@ -112,11 +157,18 @@ def fetch_and_parse_page(url, scraper):
             if heading in reversed_map:
                 kkey = reversed_map[heading]
                 data[kkey] = biodata
+
+        # Add the statistics to data
+        data['movies_cnt'] = len(movies)
+        data['vixen_cnt'] = vixen_cnt
+        data['blacked_cnt'] = blacked_cnt
+        data['tushy_cnt'] = tushy_cnt
+        data['x_art_cnt'] = x_art_cnt
 
-        return data
+        return data, movies
     except RequestException as e:
         logging.error(f"Error fetching {url}: {e}")
-        return None
+        return None, None
 
 # Write detail.json
 def write_to_detail_json(data):
@@ -128,13 +180,24 @@ def write_to_csv(data):
     try:
         with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
             writer = csv.writer(csvfile, delimiter=',')
-            header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings']
+            header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
+                      'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
+                      'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
             writer.writerow(header)
             for entry in data:
+                # Make sure performer_aka is always a list
+                performer_aka = entry.get('performer_aka', [])
+
+                # If it is None use an empty list; wrap any other non-list value in a list
+                if performer_aka is None:
+                    performer_aka = []
+                elif not isinstance(performer_aka, list):
+                    performer_aka = [performer_aka]
+
                 writer.writerow([
                     entry.get('person', ''),
                     entry.get('href', ''),
-                    '|'.join(entry.get('performer_aka', [])),
+                    '|'.join(performer_aka),
                     entry.get('birthday', ''),
                     entry.get('astrology', ''),
                     entry.get('birthplace', ''),
@@ -148,7 +211,12 @@ def write_to_csv(data):
                     entry.get('weight', ''),
                     entry.get('measurements', ''),
                     entry.get('tattoos', ''),
-                    entry.get('piercings', '')
+                    entry.get('piercings', ''),
+                    entry.get('movies_cnt', 0),
+                    entry.get('vixen_cnt', 0),
+                    entry.get('blacked_cnt', 0),
+                    entry.get('tushy_cnt', 0),
+                    entry.get('x_art_cnt', 0)
                 ])
     except Exception as e:
         logging.error(f"Error writing to CSV: {e}")
@@ -159,21 +227,49 @@ def handle_exit_signal(signal, frame):
     write_to_detail_json(final_data)
     sys.exit(0)
 
+# Create the directory
+def create_directory_for_person(person):
+    # Use the first letter of person, lowercased, as the subdirectory name
+    person_dir = person[:1].lower()
+    full_path = os.path.join(performers_dir, person_dir)
+    if not os.path.exists(full_path):
+        os.makedirs(full_path)
+    return full_path
+
+# Extract the id value from e.g. https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
+def extract_id_from_href(href):
+    """Extract the id parameter from the href."""
+    match = re.search(r'id=([a-f0-9\-]+)', href)
+    return match.group(1) if match else ''
+
+# Write a separate JSON file for each performer
+def write_person_json(person, href, data):
+    # Get the target directory
+    person_dir = create_directory_for_person(person)
+    person_id = extract_id_from_href(href)
+    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # replace spaces with -
+    full_path = os.path.join(person_dir, person_filename)
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as json_file:
+            json.dump(data, json_file, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")
+
+
 def main():
     # Initialize cloudscraper
     scraper = cloudscraper.create_scraper()
 
     # Load the list of existing hrefs
+    global final_data
     existing_hrefs = load_existing_hrefs()
+    logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
 
     # Read merged.json
     with open(input_json_file, 'r') as file:
         merged_data = json.load(file)
 
-    # Register exit signal handlers
-    signal.signal(signal.SIGINT, handle_exit_signal)  # Handle Ctrl+C
-    signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
-
     # Iterate over the data in merged.json
     loop = 0
     for entry in merged_data:
@@ -187,30 +283,51 @@ def main():
         logging.info(f"Processing {href} - {person}")
 
-        # Fetch and parse the data
-        data = fetch_and_parse_page(href, scraper)
-        if data:
-            # If the data is valid, add it to final_data
-            final_data.append({
-                'href': href,
-                'person': person,
-                **data
-            })
-            loop = loop+1
-            if loop % 100 == 0:
-                # Update the detail.json file
-                print(f'flush data to json file. now data count: {loop}')
-                write_to_detail_json(final_data)
-
-            # Update the set of existing hrefs
-            existing_hrefs.add(href)
+        while True:
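+            # Retry until the page has been fetched and parsed successfully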
+            data, movies = fetch_and_parse_page(href, scraper)
+            if data is None:
+                logging.warning(f'Retrying {href} - {person}')
+                time.sleep(3)
+            else:
+                break
+
+        # If the data is valid, add it to final_data
+        final_data.append({
+            'href': href,
+            'person': person,
+            **data
+        })
+
+        # Write the performer's own JSON file
+        full_data = {
+            'href': href,
+            'person': person,
+            **data,
+            'movies': movies if movies else []
+        }
+        write_person_json(person.strip(), href, full_data)
+
+        # Update the detail.json file
+        loop = loop + 1
+        if loop % 100 == 0:
+            logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
+            write_to_detail_json(final_data)
+            write_to_csv(final_data)
+
+        # Update the set of existing hrefs
+        existing_hrefs.add(href)
 
         # Delay to avoid being blocked for requesting too quickly
         time.sleep(1)
 
-    # Write the CSV once when finished
-    write_to_csv(final_data)
-
-    logging.info("Data processing completed.")
 
 if __name__ == "__main__":
-    main()
+    try:
+        # Register exit signal handlers
+        signal.signal(signal.SIGINT, handle_exit_signal)  # Handle Ctrl+C
+        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
+        main()
+    finally:
+        # Cleanup, to make sure it runs when the program exits normally
+        write_to_csv(final_data)  # Write to CSV or other necessary tasks
+        write_to_detail_json(final_data)  # Save data to JSON
+        logging.info("Data processing completed.")