modify scripts: add parse_uncensored, test_uncensored, and utils.json_to_csv
@@ -518,6 +518,44 @@ def parse_maker_detail(soup, href):
     return list_data, next_url
 
 
+# Parse the HTML content and extract the data we need
+def parse_uncensored(soup, href):
+    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
+    if not div_movies:
+        logging.warning("No movies div found")
+        return [], None
+
+    # Parse the item elements
+    rows = div_movies.find_all('div', class_='item')
+
+    list_data = []
+    next_url = None
+    for row in rows:
+        link = row.find('a', class_='box')['href']
+        serial_number = row.find('strong').text.strip()
+        title = row.find('div', class_='video-title').text.strip()
+        release_date = row.find('div', class_='meta').text.strip()
+        list_data.append({
+            'href': host_url + link if link else '',
+            'serial_number': serial_number,
+            'title': title,
+            'release_date': release_date
+        })
+
+    # Look for the "next page" button
+    next_page_element = soup.find('a', class_='pagination-next')
+    if next_page_element:
+        next_page_url = next_page_element['href']
+        next_page_number = url_page_num(next_page_url)
+        current_page_number = url_page_num(href)
+        if current_page_number is None:
+            current_page_number = 0
+        if next_page_number and next_page_number > current_page_number:
+            next_url = host_url + next_page_url
+
+    return list_data, next_url
+
+
 ###### Test code below ######
 def test_actors_list():
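The pagination check in parse_uncensored leans on url_page_num, which is defined elsewhere in this script and not shown in the diff. As a rough sketch only, here is what such a helper could look like, assuming the page number travels in a ?page= query parameter (only the name url_page_num comes from the diff; the body is an assumption):

    from urllib.parse import urlparse, parse_qs

    def url_page_num(url):
        # Pull the "page" query parameter out of a URL,
        # e.g. "/search?q=foo&page=3" -> 3; None when absent.
        try:
            return int(parse_qs(urlparse(url).query)['page'][0])
        except (KeyError, ValueError):
            return None

This matches how parse_uncensored uses it: a missing page number comes back as None and is treated as page 0 before the comparison.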
@@ -591,12 +629,25 @@ def test_series_detail():
     print(all_data)
 
 
+def test_uncensored():
+    next_url = 'https://javdb.com/search?from_recent=1&q=%E6%97%A0%E7%A0%81%E6%B5%81%E5%87%BA'
+    all_data = []
+    while next_url:
+        print(f'fetching page {next_url}')
+        soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-8", attr_type="class"))
+        if soup:
+            list_data, next_url = parse_uncensored(soup, next_url)
+            if list_data:
+                all_data.extend(list_data)
+        else:
+            print('got a wrong page.')
+            break  # avoid retrying the same URL forever
+    utils.json_to_csv(all_data, 'uncensored.csv')
+
+
 if __name__ == "__main__":
     #test_actors_list()
     #test_actor()
-    test_movie_detail()
+    #test_movie_detail()
     #test_series_list()
     #test_series_detail()
+    test_uncensored()
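fetch_page and generic_validator are likewise defined elsewhere in the script. From the call site one can infer the contract: fetch_page(url, validator) returns a (soup, status_code) pair, and the validator (bound with functools.partial) checks that the expected element is actually present in the parsed page. A minimal sketch under those assumptions, not the script's actual implementation:

    import requests
    from bs4 import BeautifulSoup

    def generic_validator(soup, tag, identifier, attr_type):
        # The page only counts as valid if the expected element exists,
        # e.g. the <div class="movie-list h cols-4 vcols-8"> result grid.
        return soup.find(tag, attrs={attr_type: identifier}) is not None

    def fetch_page(url, validator):
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Return None for soup when the page does not validate, which is
        # what test_uncensored's `if soup:` branch relies on.
        return (soup if validator(soup) else None), resp.status_code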
@@ -106,3 +106,13 @@ def remove_url_query(url: str) -> str:
     except Exception as e:
         print(f"Failed to parse URL: {e}")
         return url
+
+# Write a CSV file
+def json_to_csv(data, output_file):
+    if not data:
+        return
+    headers = list(data[0].keys())
+    with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=headers)
+        writer.writeheader()
+        for row in data:
+            writer.writerow(row)