add files
This commit is contained in:
@ -113,11 +113,15 @@ def format_ethnic(ethnic):
|
|||||||
|
|
||||||
# 主逻辑函数:循环处理每个种族
|
# 主逻辑函数:循环处理每个种族
|
||||||
def process_ethnic_data():
|
def process_ethnic_data():
|
||||||
|
all_person = len(ethnic_map) # 应该为0
|
||||||
|
all_pages = 0
|
||||||
|
|
||||||
for ethnic in ethnic_list:
|
for ethnic in ethnic_list:
|
||||||
url = base_url + format_ethnic(ethnic)
|
url = base_url + format_ethnic(ethnic)
|
||||||
next_url = url
|
next_url = url
|
||||||
cursor = int(len(ethnic_map) / 100)
|
cursor = int(all_person / 100)
|
||||||
logging.info(f"Fetching data for {ethnic}, url {url} ...")
|
pages = 0
|
||||||
|
logging.info(f"--------Fetching data for {ethnic}, url {url} ...")
|
||||||
|
|
||||||
while next_url:
|
while next_url:
|
||||||
html = fetch_page(next_url)
|
html = fetch_page(next_url)
|
||||||
@ -125,9 +129,10 @@ def process_ethnic_data():
|
|||||||
soup = parse_page(html, ethnic)
|
soup = parse_page(html, ethnic)
|
||||||
if soup:
|
if soup:
|
||||||
next_url = handle_pagination(soup, ethnic)
|
next_url = handle_pagination(soup, ethnic)
|
||||||
|
pages = pages + 1
|
||||||
else:
|
else:
|
||||||
logging.info(f"wrong html content. retring {next_url} ...")
|
logging.info(f"wrong html content. retring {next_url} ...")
|
||||||
# 定期保存结果
|
# 统计,并定期保存结果
|
||||||
if len(ethnic_map) / 100 > cursor:
|
if len(ethnic_map) / 100 > cursor:
|
||||||
cursor = int(len(ethnic_map) / 100)
|
cursor = int(len(ethnic_map) / 100)
|
||||||
save_data()
|
save_data()
|
||||||
@ -135,6 +140,14 @@ def process_ethnic_data():
|
|||||||
else:
|
else:
|
||||||
logging.info(f"Retrying {next_url} ...")
|
logging.info(f"Retrying {next_url} ...")
|
||||||
time.sleep(5) # 等待后再重试
|
time.sleep(5) # 等待后再重试
|
||||||
|
# 统计输出
|
||||||
|
ethnic_person = len(ethnic_map) - all_person
|
||||||
|
all_person = len(ethnic_map)
|
||||||
|
all_pages = all_pages + pages
|
||||||
|
logging.info(f"--------Fetching data for {ethnic} end. total pages: {pages}, total persons: {ethnic_person}, all persons fetched: {all_person}")
|
||||||
|
# 统计最后结果
|
||||||
|
logging.info(f"--------Fetching all data end. total ethnic: {len(ethnic_list)}, total pages: {all_pages}, total persons: {all_person}")
|
||||||
|
|
||||||
|
|
||||||
# 保存到文件
|
# 保存到文件
|
||||||
def save_data():
|
def save_data():
|
||||||
|
|||||||
@ -7,3 +7,12 @@ Dream Doll,https://www.iafd.com/person.rme/id=481fd985-9740-4c1b-b794-4620d6528c
|
|||||||
Gianna,https://www.iafd.com/person.rme/id=8e22a8e0-e28e-4767-ad74-cae7a1ef6f15,No known aliases,01/01/????,Capricorn,No data,Trans woman,2006-2009,Caucasian,No data,,Unknown,No data,No data,No data,No data,No data
|
Gianna,https://www.iafd.com/person.rme/id=8e22a8e0-e28e-4767-ad74-cae7a1ef6f15,No known aliases,01/01/????,Capricorn,No data,Trans woman,2006-2009,Caucasian,No data,,Unknown,No data,No data,No data,No data,No data
|
||||||
Kirsten,https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34,Kirsten Claudia|Trish,01/01/????,Capricorn,No data,Trans woman,,Latin,No data,Black/Brown/Light Brown,Unknown,No data,No data,36DD-24-35,No data,No data
|
Kirsten,https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34,Kirsten Claudia|Trish,01/01/????,Capricorn,No data,Trans woman,,Latin,No data,Black/Brown/Light Brown,Unknown,No data,No data,36DD-24-35,No data,No data
|
||||||
Maya,https://www.iafd.com/person.rme/id=fae84552-50cf-494b-b34f-a22e1a669fd9,No known aliases,01/01/????,Capricorn,No data,Trans woman,,Latin/Multi-ethnic,No data,Black/Dark Brown,Unknown,"5 feet, 5 inches (165 cm)",125 lbs (57 kg),No data,No data,No data
|
Maya,https://www.iafd.com/person.rme/id=fae84552-50cf-494b-b34f-a22e1a669fd9,No known aliases,01/01/????,Capricorn,No data,Trans woman,,Latin/Multi-ethnic,No data,Black/Dark Brown,Unknown,"5 feet, 5 inches (165 cm)",125 lbs (57 kg),No data,No data,No data
|
||||||
|
Paris,https://www.iafd.com/person.rme/id=79724d4a-dbd4-4e15-9d90-a49ec2a6bde1,No known aliases,01/01/????,Capricorn,US,Trans woman,,Black,American,,Unknown,No data,No data,No data,No data,No data
|
||||||
|
Shaniah,https://www.iafd.com/person.rme/id=fcb9e75d-3086-4950-a48a-599ae053c8cf,No known aliases,01/01/????,Capricorn,US,Trans woman,,Black,American,,Unknown,No data,No data,No data,No data,Navel
|
||||||
|
Tosha,https://www.iafd.com/person.rme/id=37b08999-7ba0-460b-9a26-f6be57406ded,No known aliases,01/01/????,Capricorn,Maryland,Trans woman,,Black,American,,Unknown,No data,No data,No data,No data,No data
|
||||||
|
Viktoria,https://www.iafd.com/person.rme/id=e585c62f-8df6-4312-a023-9563e6b559e5,No known aliases,01/01/????,Capricorn,Russia,Trans woman,,Caucasian,Russian,,Unknown,No data,No data,No data,No data,No data
|
||||||
|
Maryanne Fisher,https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586,No known aliases,01/01/19??,Capricorn,No data,Woman,,Caucasian,No data,,Unknown,No data,No data,No data,No data,No data
|
||||||
|
Claude Marcault,https://www.iafd.com/person.rme/id=19971b82-648b-4795-9fd5-a0f7c3ed867f,No known aliases,"January 1, 1941 (would be 84 years old)",Capricorn,India,Woman,,Caucasian,French,,Unknown,No data,No data,No data,No data,No data
|
||||||
|
Fernando Arcangeli,https://www.iafd.com/person.rme/id=b32894f5-d15f-40fa-8048-daa6cfa6e480,Ferdinando Arcangeli|Mimì Losy,"January 1, 1942 (83 years old)",Capricorn,"Province of Rome, Italy",Man,1976-1983 (Started around 34 years old),Caucasian,Italian,,Unknown,No data,No data,,No data,No data
|
||||||
|
Doris Arden,https://www.iafd.com/person.rme/id=585e3afc-9525-48cd-a1d7-9895049c3948,No known aliases,"January 1, 1946 (79 years old)",Capricorn,"Trostberg, Bavaria, Germany",Woman,,Caucasian,German,,Hazel,No data,No data,No data,No data,No data
|
||||||
|
Ennio Pontis,https://www.iafd.com/person.rme/id=b2aa39b4-644d-4f61-b088-2ff80ed344b3,No known aliases,"January 1, 1951 (74 years old)",Capricorn,"Rome, Italy",Man,,Caucasian,Italian,,Unknown,No data,No data,,No data,No data
|
||||||
|
|||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user