From 16fe4ec54e8cf609449967c19568815a92306c53 Mon Sep 17 00:00:00 2001 From: oscarz Date: Wed, 23 Apr 2025 19:18:06 +0800 Subject: [PATCH] modify scripts --- javdb/src/scraper.py | 55 ++++++++++++++++++++++++++++++++++++++++++-- javdb/src/utils.py | 10 ++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/javdb/src/scraper.py b/javdb/src/scraper.py index f503038..84867fb 100644 --- a/javdb/src/scraper.py +++ b/javdb/src/scraper.py @@ -518,6 +518,44 @@ def parse_maker_detail(soup, href): return list_data, next_url +# 解析 HTML 内容,提取需要的数据 +def parse_uncensored(soup, href): + div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8') + if not div_movies: + logging.warning(f"Warning: No movies div found ") + return [], None + + # 解析元素 + rows = div_movies.find_all('div', class_='item') + + list_data = [] + next_url = None + for row in rows: + link = row.find('a', class_='box')['href'] + serial_number = row.find('strong').text.strip() + title = row.find('div', class_='video-title').text.strip() + release_date = row.find('div', class_='meta').text.strip() + list_data.append({ + 'href' : host_url + link if link else '', + 'serial_number' : serial_number, + 'title' : title, + 'release_date': release_date + }) + + # 查找 "下一页" 按钮 + next_page_element = soup.find('a', class_='pagination-next') + if next_page_element: + next_page_url = next_page_element['href'] + next_page_number = url_page_num(next_page_url) + current_page_number = url_page_num(href) + if current_page_number is None: + current_page_number = 0 + if next_page_number and next_page_number > current_page_number : + next_url = host_url + next_page_url + + return list_data, next_url + + ###### 以下为测试代码 ###### def test_actors_list(): @@ -591,12 +629,25 @@ def test_series_detail(): print(all_data) +def test_uncensored(): + next_url = 'https://javdb.com/search?from_recent=1&q=%E6%97%A0%E7%A0%81%E6%B5%81%E5%87%BA' + all_data = [] + while next_url: + print(f'fetching page {next_url}') + soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-8", attr_type="class")) + if soup: + list_data, next_url = parse_uncensored(soup, next_url) + if list_data : + all_data.extend(list_data) + else: + print('get wrong page.') + utils.json_to_csv(all_data, 'uncensored.csv') if __name__ == "__main__": #test_actors_list() #test_actor() - test_movie_detail() + #test_movie_detail() #test_series_list() #test_series_detail() - + test_uncensored() \ No newline at end of file diff --git a/javdb/src/utils.py b/javdb/src/utils.py index c8e0ff9..c122e20 100644 --- a/javdb/src/utils.py +++ b/javdb/src/utils.py @@ -106,3 +106,13 @@ def remove_url_query(url: str) -> str: except Exception as e: print(f"解析 URL 失败: {e}") return url +# 写csv文件 +def json_to_csv(data, output_file): + if not data: + return + headers = list(data[0].keys()) + with open(f"{update_dir}/{output_file}", 'w', encoding='utf-8', newline='') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=headers) + writer.writeheader() + for row in data: + writer.writerow(row)