add some scripts.
@@ -63,6 +63,46 @@ def load_existing_hrefs():
        logging.info("detail.json not found, starting fresh.")
    return existing_hrefs

# Parse the credits table; it can hold acting credits as well as directing credits
def parse_credits_table(table, distributor_list):
    # Find the thead and skip it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # remove the thead section, no need to parse it

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialise a count for every distributor

    # rows = table.find_all('tr', class_='we')
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()

            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1

            movies.append({
                'title': title,
                'year': year,
                'distributor': distributor,
                'notes': notes,
                'rev': rev,
                'formats': formats
            })
    return movies, distributor_count


# Fetch the page and extract the data we need
def fetch_and_parse_page(url, scraper):
    try:
@@ -96,61 +136,31 @@ def fetch_and_parse_page(url, scraper):
        }
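        # invert fields so the heading text shown on the page maps back to our field key (assumes fields maps key -> heading label)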
        reversed_map = {v: k for k, v in fields.items()}

        # Parse the table data
        movies = []
        vixen_cnt = 0
        blacked_cnt = 0
        tushy_cnt = 0
        x_art_cnt = 0
        role = 'personal'
        table = soup.find('table', id='personal')
        if table is None:
            table = soup.find('table', id='directoral')
            role = 'directoral'
        if table:
            # Find the thead and skip it
            thead = table.find('thead')
            if thead:
                thead.decompose()  # remove the thead section, no need to parse it
        # Parse the table data and collect the acting and directing credit lists
        role_list = ['personal', 'directoral']
        distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
        credits_list = {}

            # Only the tbody is left now
            tbody = table.find('tbody')
            rows = tbody.find_all('tr') if tbody else []
        # Use a dict to hold the per-distributor counts
        distributor_count = {key: 0 for key in distributor_list}  # initialise a count for every distributor
        for role in role_list:
            table = soup.find('table', id=role)
            if table:
                movies, stat_map = parse_credits_table(table, distributor_list)
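                # stat_map holds this table's per-distributor counts; they are folded into the overall tally below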
                credits_list[role] = movies
                # Update the distributor counts
                for distributor in distributor_list:
                    distributor_count[distributor] += stat_map.get(distributor, 0)

            # rows = table.find_all('tr', class_='we')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 6:
                    title = cols[0].text.strip()
                    year = cols[1].text.strip()
                    distributor = cols[2].text.strip().lower()
                    notes = cols[3].text.strip()
                    rev = cols[4].text.strip()
                    formats = cols[5].text.strip()
        # Count how many movies were found
        #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
        movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

                    # Count the keywords in the distributor field
                    if 'vixen' in distributor:
                        vixen_cnt += 1
                    if 'blacked' in distributor:
                        blacked_cnt += 1
                    if 'tushy' in distributor:
                        tushy_cnt += 1
                    if 'x_art' in distributor:
                        x_art_cnt += 1
        # Nothing was found at all
        if len(credits_list) == 0:
            logging.warning(f"movie table empty. url: {url} ")

                    movies.append({
                        'title': title,
                        'year': year,
                        'distributor': distributor,
                        'notes': notes,
                        'rev': rev,
                        'formats': formats
                    })
        else:
            logging.warning(f"movie table empty. ")


        # Iterate over every bioheading
        # Iterate over every bioheading and collect the metadata
        bioheadings = soup.find_all('p', class_='bioheading')
        for bio in bioheadings:
            heading = bio.text.strip()
@@ -172,13 +182,13 @@ def fetch_and_parse_page(url, scraper):
                data[kkey] = biodata

        # Add the counts to data
        data['movies_cnt'] = len(movies)
        data['vixen_cnt'] = vixen_cnt
        data['blacked_cnt'] = blacked_cnt
        data['tushy_cnt'] = tushy_cnt
        data['x_art_cnt'] = x_art_cnt
        data['movies_cnt'] = movies_cnt
        data['vixen_cnt'] = distributor_count['vixen']
        data['blacked_cnt'] = distributor_count['blacked']
        data['tushy_cnt'] = distributor_count['tushy']
        data['x_art_cnt'] = distributor_count['x-art']

        return data, {'role': role, 'movies': movies}
        return data, credits_list
    except RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None, None
@@ -270,7 +280,35 @@ def write_person_json(person, href, data):
        logging.error(f"Error writing file {full_path}: {e}")


def main():
# Visit a single specified url
def process_one(href):
    # Initialise cloudscraper
    scraper = cloudscraper.create_scraper()
    # Fetch and parse the data
    while True:
        data, movies = fetch_and_parse_page(href, scraper)
        if data is None:
            logging.warning(f'Retrying {href} ')
            time.sleep(3)
        else:
            break

    # Write the performer's standalone JSON file
    full_data = {
        **data,
        'credits': movies if movies else {}
    }
    person_id = extract_id_from_href(href)
    person_filename = f"{person_id}.json"  # name the file after the person id

    try:
        with open(person_filename, 'w', encoding='utf-8') as json_file:
            json.dump(full_data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {person_filename}: {e}")
    print(f'fetch succeeded, saved result in {person_filename}')

def process_all():
    # Initialise cloudscraper
    scraper = cloudscraper.create_scraper()

@@ -297,7 +335,7 @@ def main():

        # Fetch and parse the data
        while True:
            data, movies = fetch_and_parse_page(href, scraper)
            data, credits = fetch_and_parse_page(href, scraper)
            if data is None:
                logging.warning(f'Retrying {href} - {person} ')
                time.sleep(3)
@@ -316,7 +354,7 @@ def main():
            'href': href,
            'person': person,
            **data,
            'credits': movies if movies else {}
            'credits': credits if credits else {}
        }
        write_person_json(person.strip(), href, full_data)

@@ -333,14 +371,23 @@ def main():
        # Delay so we are not blocked for requesting too quickly
        time.sleep(1)

if __name__ == "__main__":
# Full crawl over every href
def main():
    try:
        # Register the exit signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)  # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        main()
        process_all()
    finally:
        # Cleanup that must run when the program exits normally
        write_to_csv(final_data)  # Write to CSV or other necessary tasks
        write_to_detail_json(final_data)  # Save data to JSON
        logging.info("Data processing completed.")
        logging.info("Data processing completed.")


if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()