resources/javdb/get_javdb.py
"""
Script Name:
Description: 获取 javdb 数据, prompt:
我们需要访问 https://javdb.com/search?f=all&page={p}&q={str} 这个地址,并返回数据,以下是需求详细描述:
q 参数,我们有一个数组,分别是 qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
p 参数是要访问的页码它通常从1开始。
我们循环遍历 qlist对每一个值从 p=1 开始,组成一个访问的 URL 获取该 URL 的内容,它是一个页面;
对页面内容,循环读取每一行,进行查找:
如果能匹配 <div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div> 这个格式,那么我们把其中标签修饰的两段文本找出来,分别记为 str1 和str2然后输出 str1__str2 这样的格式;如果格式不匹配,则不输出;
如果匹配 <div class="meta">这个格式那么读取它的下一行去掉空格与tab符号之后会剩下一个日期字符串把这个字符串记为 pubdate
我们会得到 str1__pubdate__str2 这样的文本,把它保存到一个变量 res 中;
继续遍历页面,如果找到匹配 <a rel="next" class="pagination-next" href="/search?f=all&amp;page=5&amp;q=SMBD">下一頁</a> 格式的一行,说明还有下一页,把其中的 page=5 的数字提取出来,修改上面的 URL填入新的 p值继续访问如果无法匹配那就代表着结束我们把 res 输出到一个文件中,它命名为 {q}_all.txt
请你理解上述需求并写出对应的python代码。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import re
import time

import requests

# Parameter definitions
qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
base_url = "https://javdb.com/search?f=all&page={}&q={}"
# Temporary run: restrict to these codes for now
qlist = ['SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']

# Regex patterns
title_pattern = r'<div class="video-title"><strong>(.*?)</strong>\s*(.*?)</div>'
meta_pattern = r'<div class="meta">'
next_page_pattern = r'<a rel="next" class="pagination-next" href=".*?page=(\d+)&amp;q='
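
# Quick self-check of title_pattern (a minimal sketch using the sample line
# from the docstring; safe to delete, it only runs a regex against a literal):
_sample = '<div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div>'
_m = re.search(title_pattern, _sample)
assert _m and _m.group(1) == 'SHIIKU-001' and _m.group(2) == '性奴●飼育マニュアル THE MOVIE'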

def get_page_content(url):
    """Send a GET request and return the page body, or None on failure."""
    # The timeout is an addition so a stalled connection cannot hang the loop.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Request failed with status code {response.status_code}")
        return None
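
# A hedged alternative with simple retries (a sketch; the attempt count and
# backoff values are assumptions, not part of the original script):
def get_page_content_with_retries(url, attempts=3, backoff=5):
    """Try get_page_content up to `attempts` times, sleeping between tries."""
    for _ in range(attempts):
        content = get_page_content(url)
        if content is not None:
            return content
        time.sleep(backoff)
    return None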

def parse_page(content):
    """Parse page content; extract titles, dates, and the next-page number."""
    res = []
    next_page = None
    lines = content.split('\n')  # Process the page line by line
    str1, str2, pubdate = None, None, None
    meta_found = False
    for line in lines:
        # Try to match a title line
        title_match = re.search(title_pattern, line)
        if title_match:
            str1 = title_match.group(1).strip()
            str2 = title_match.group(2).strip()
        # Try to match <div class="meta">
        if re.search(meta_pattern, line):
            meta_found = True
            continue
        # The line right after <div class="meta"> carries the date
        if meta_found:
            pubdate = line.strip()
            meta_found = False
        # Once both the title and the date are matched, store the result
        if str1 and str2 and pubdate:
            res.append(f"{str1}__{pubdate}__{str2}")
            str1, str2, pubdate = None, None, None
        # Try to match the next-page link
        next_page_match = re.search(next_page_pattern, line)
        if next_page_match:
            next_page = next_page_match.group(1)
    return res, next_page
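
# Minimal self-check for parse_page on a synthetic three-line page (an
# assumption mirroring the prompt: the date sits on the line right after
# <div class="meta">; safe to delete):
_demo_page = (
    '<div class="video-title"><strong>SMBD-01</strong> Example Title</div>\n'
    '<div class="meta">\n'
    '    2020-01-01\n'
)
assert parse_page(_demo_page) == (['SMBD-01__2020-01-01__Example Title'], None)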

def scrape_videos_for_q(q):
    """Scrape all result pages for the given q parameter."""
    p = 1
    res = []
    while True:
        # Build the URL
        url = base_url.format(p, q)
        print(f"Fetching: {url}")
        page_content = get_page_content(url)
        if page_content:
            # Parse the page content
            results, next_page = parse_page(page_content)
            res.extend(results)
            # Continue if there is a next page, otherwise stop
            if next_page:
                p = int(next_page)
                time.sleep(5)  # Throttle requests
            else:
                break
        else:
            print(f"Failed to fetch page content, skipping q={q}")
            break
    # Save the results to a file
    if res:
        output_filename = f"./javdb/{q}_all.txt"
        # Make sure the output directory exists before writing
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write("\n".join(res))
        print(f"Saved results to {output_filename}")

def main():
    """Entry point: scrape every code in qlist."""
    for q in qlist:
        scrape_videos_for_q(q)

if __name__ == "__main__":
    main()
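
# Usage (a sketch): run the script directly. It writes one file per code in
# qlist, e.g. ./javdb/SMBD_all.txt, with one line per video in the form
#   SMBD-01__2020-01-01__Example Title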