import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils

# Define the base URLs and variable parameters
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

ethnic_list_url = f'{host_url}/advsearch.asp'

# Set up the request headers and the cloudscraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Status codes used by fetch_page and its callers
http_code_404 = 404    # page not found, or invalid/outdated page
http_code_login = 401  # redirected (e.g. to a login page), validation failed
http_code_url = 601    # malformed URL (not under host_url)
http_code_local = 99   # page served from the local HTML cache

save_raw_html = True     # write fetched HTML to the local cache (via utils)
load_from_local = False  # read pages from the local cache instead of the network

def common_parser(html, page, **kwargs):
    parser = "lxml" if page == 'ethnic' else "html.parser"
    soup = BeautifulSoup(html, parser)
    if not soup:
        return None
    if page == 'astro':
        #parse_page_astro(soup, astro):
        return parse_page_astro(soup, **kwargs)
    elif page == 'birth':
        #parse_page_birth(soup, month, day):
        return parse_page_birth(soup, **kwargs)
    elif page == 'ethnic':
        #parse_page_ethnic(soup, ethnic):
        return parse_page_ethnic(soup, **kwargs)
    elif page == 'dist':
        return parse_page_dist_stu(soup, 'distable')
    elif page == 'stu':
        return parse_page_dist_stu(soup, 'studio')
    elif page == 'actor':
        #parse_page_performer(soup, url):
        return parse_page_performer(soup, **kwargs)
    elif page == 'movies':
        #parse_page_movie(soup, href, title)
        return parse_page_movie(soup, **kwargs)
    else:
        logging.warning(f"wrong page: {page}")
        return None

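# Illustrative usage sketch for common_parser (not part of the original flow; assumes
# `raw_html` holds page text already fetched or read from the local cache):
#   data, next_url = common_parser(raw_html, 'astro', astro='Aries')
#   data, next_url = common_parser(raw_html, 'birth', month=1, day=1)
# The **kwargs are forwarded unchanged, so they must match the signature of the
# parse_page_* function selected by `page`.
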
'''
# Use CloudScraper for the network request and validate the page; supports different
# parsers and an optional HTML preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    if load_from_local:  # serve the page from the local cache
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML (if a preprocessor was provided)
            html_text = preprocessor(html) if preprocessor else html

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, http_code_local  # code below 100 marks a local cache hit

    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, http_code_url

            response = scraper.get(url, headers=headers)

            # Handle the HTTP status code
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # return 404 directly; the caller can skip it

            response.raise_for_status()  # raise on other HTTP errors

            # Outdated pages are handled the same way as a 404
            if "invalid or outdated page" in response.text.lower():
                logging.debug(f"invalid or outdated page: {url}")
                return None, http_code_404  # return 404 directly; the caller can skip it

            if save_raw_html:
                utils.write_raw_html(url, response.text)

            # Preprocess the HTML (if a preprocessor was provided)
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code
            else:
                # Check whether a redirect happened, e.g. to a login page
                if response.history:
                    logging.warning(f"Page redirected on {url}. Validation failed.")
                    return None, http_code_login

                logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after reaching the maximum number of retries
'''

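# Minimal live fetch_page for the __main__ demo at the bottom of this file.
# This is only a sketch, assuming the same (soup, status_code) return contract as the
# disabled version above; the local-cache reads/writes that depend on the (also
# disabled) `utils` module are intentionally omitted.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, http_code_url

            response = scraper.get(url, headers=headers)
            if response.status_code == 404:
                return None, http_code_404
            response.raise_for_status()

            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup, response.status_code

            if response.history:  # redirected, e.g. to a login page
                logging.warning(f"Page redirected on {url}. Validation failed.")
                return None, http_code_login
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None
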
# Repair the HTML structure: drop stray <br> tags and patch <a> tags; needed when
# fetching the ethnicity pages.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

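# Example (sketch) of the rewrite performed by preprocess_html:
#   preprocess_html('<br><a href="/x">A</a>')  ->  '<a target="_blank" href="/x">A</a>'
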
# Generic validator for the expected HTML structure
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

# Check whether the movie information table exists
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None

# Parse the HTML content and extract the needed data (ethnicity dropdown)
def parse_page_ethnic_list(soup, href):
    div_root = soup.find("select", id="ethnicity1")
    if not div_root:
        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
        return None, None

    list_data = []

    # Extract all of the <option> tags
    options = div_root.find_all('option')
    if options:
        # Parse the value attribute and the text content of each option
        for option in options:
            href = option.get('value', None)
            text = option.text.strip()
            if href and href.lower() == 'none':
                continue
            list_data.append({
                "name": text,
                "href": host_url + href if href else ''
            })
    return list_data

# Parse the HTML content and extract the needed data (astrology page)
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None

    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None

# Parse the page content and update birth_map
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href

        # Skip hrefs that are already in the list
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue

        # Add the entry to the list
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt = list_cnt + 1

    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None

# Parse the HTML content and extract the needed data (ethnicity results page)
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None

    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True

            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()

                # Store the entry
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")

        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None

# Parse the list page (distributor/studio dropdown)
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None

    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # the value attribute
            text = option.text.strip()   # the text content
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None

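# Sketch of how the list entries are typically combined with the detail URL templates
# defined at the top of this file (an assumption; the select name 'distrib' and the
# driver code are illustrative, not taken from this module):
#   entries, _ = parse_page_dist_stu_list(soup, 'distrib')
#   detail_url = distributors_base_url + entries[0]['href']
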
# Parse the HTML content and extract the needed data (distributor/studio title table)
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None

    # Find the thead and skip it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # drop the thead; it does not need parsing

    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url

# Parse a credits table; covers both performer and directorial credits
def parse_credits_table(table, distributor_list):
    # Find the thead and skip it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # drop the thead; it does not need parsing

    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialise a counter per distributor

    # rows = table.find_all('tr', class_='we')
    for row in rows:
        #tr_class = row.get('class', '')  # class attribute, or '' when missing
        tr_class = ' '.join(row.get('class', []))  # class attribute, or '' when missing
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            href_a = cols[0].find('a')
            href = href_a['href'] if href_a else ''
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            href_d = cols[2].find('a')
            href_dist = host_url + href_d['href'] if href_d else ''
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()

            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1

            movies.append({
                'title': title,
                'href': href,
                'year': year,
                'distributor': distributor,
                'distributor_href': href_dist,
                'notes': notes,
                'rev': rev,
                'formats': formats,
                'tr_class': tr_class
            })
    return movies, distributor_count

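# Shape of the second value returned above, for the distributor_list used by
# parse_page_performer below (the counts shown are illustrative):
#   {'vixen': 3, 'blacked': 0, 'tushy': 1, 'x-art': 0}
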
# Parse the performer page and extract the needed data
def parse_page_performer(soup, url):
    # Extracted data
    data = {}

    # Map the field names we need to the corresponding labels in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credits tables to get the performer and directorial filmographies
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Use a dict to keep the statistics
    distributor_count = {key: 0 for key in distributor_list}  # initialise a counter per distributor
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Update the distributor statistics
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Count the movies
    #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

    # Nothing was found
    if len(credits_list) == 0:
        logging.warning(f"movie table empty. url: {url} ")

    # Walk every bioheading to collect the metadata
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''

        # Save the value
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Add the statistics to data
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list

    return data

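# The dict returned above contains the bio fields defined in `fields` (only those
# present on the page), plus movies_cnt, vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt
# and credits (a {'personal': [...], 'directoral': [...]} mapping of parsed movie rows).
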
# Parse the movie page HTML and extract the movie information
def parse_page_movie(soup, href, title):
    # Parse the basic movie information
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            if key == "Directors":  # handle multiple directors
                directors = []
                links = value.find_all("a")
                for link in links:
                    director_name = link.text.strip()
                    director_href = host_url + link['href'] if link.get('href') else ''
                    directors.append({"name": director_name, "href": director_href})
                movie_data[key] = directors
            else:
                val = value.text.strip()
                if key in ["Distributor", "Studio", "Director"]:
                    link = value.find("a")
                    if link:
                        val = link.text.strip()
                        movie_data[f'{key}Href'] = host_url + link['href']
                movie_data[key] = val
    else:
        return None

    # Parse the cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

            #performer["tags"] = [
            #    tag.strip() for br in cast.find_all("br")
            #    if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            #]

            tags = []
            for br in cast.find_all("br"):
                tag = br.next_sibling
                if isinstance(tag, str) and tag.strip():
                    tags.append(tag.strip())
            performer["tags"] = tags

            #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
            performers.append(performer)

    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # contains the performers and link info

                # Take the full HTML before the first <br> (keeps formatting such as <i> tags);
                # BeautifulSoup may render the tag as either <br> or <br/>, so split on both.
                performer_html = str(performer_info)
                split_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)
                performers_html = split_html[0].strip()

                # Convert to plain text (strip the HTML tags, keep only the text)
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performers
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to get the `webscene` and `studio` links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Directors": movie_data.get("Directors", []),  # may not be present
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            # fetch_page returns a (soup, status_code) pair
            soup, _status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

        time.sleep(2)  # throttle the request rate