modify aabook codes.
This commit is contained in:
132
scripts/javdb/get_javdb.py
Normal file
132
scripts/javdb/get_javdb.py
Normal file
@ -0,0 +1,132 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 获取 javdb 数据, prompt:
|
||||
我们需要访问 https://javdb.com/search?f=all&page={p}&q={str} 这个地址,并返回数据,以下是需求详细描述:
|
||||
q 参数,我们有一个数组,分别是 qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
|
||||
p 参数,是要访问的页码,它通常从1开始。
|
||||
|
||||
我们循环遍历 qlist,对每一个值,从 p=1 开始,组成一个访问的 URL, 获取该 URL 的内容,它是一个页面;
|
||||
对页面内容,循环读取每一行,进行查找:
|
||||
如果能匹配 <div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div> 这个格式,那么我们把其中标签修饰的两段文本找出来,分别记为 str1 和str2,然后输出 str1__str2 这样的格式;如果格式不匹配,则不输出;
|
||||
如果匹配 <div class="meta">这个格式,那么读取它的下一行,去掉空格与tab符号之后,会剩下一个日期字符串,把这个字符串记为 pubdate;
|
||||
我们会得到 str1__pubdate__str2 这样的文本,把它保存到一个变量 res 中;
|
||||
继续遍历页面,如果找到匹配 <a rel="next" class="pagination-next" href="/search?f=all&page=5&q=SMBD">下一頁</a> 格式的一行,说明还有下一页,把其中的 page=5 的数字提取出来,修改上面的 URL,填入新的 p值,继续访问;如果无法匹配,那就代表着结束,我们把 res 输出到一个文件中,它命名为 {q}_all.txt
|
||||
|
||||
请你理解上述需求,并写出对应的python代码。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import os
import re
import time

import requests
#from bs4 import BeautifulSoup
|
||||
|
||||
# Search parameters: prefix keywords to query and the search-URL template
# (placeholders are page number, then keyword).
qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
base_url = "https://javdb.com/search?f=all&page={}&q={}"

# Temporary partial run.
# NOTE(review): this reassignment silently overrides the full qlist above —
# only the four keywords below are actually scraped. Remove when done.
qlist = ['SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']

# Regex patterns (pages are scanned line by line in parse_page):
# title line -> group(1) = code (str1), group(2) = title text (str2)
title_pattern = r'<div class="video-title"><strong>(.*?)</strong>\s*(.*?)</div>'
# marker line whose NEXT line carries the publication date
meta_pattern = r'<div class="meta">'
# "next page" link -> group(1) = next page number
# NOTE(review): assumes a raw '&' before 'q=' in the href; served HTML often
# escapes it as '&amp;q=', which this pattern would miss — verify.
next_page_pattern = r'<a rel="next" class="pagination-next" href=".*?page=(\d+)&q='
|
||||
|
||||
def get_page_content(url, timeout=15):
    """Fetch *url* and return the response body as text.

    Args:
        url: Fully-formed search URL to request.
        timeout: Seconds to wait for the server (new, defaulted — backward
            compatible). Fix: the original call had no timeout, so a stalled
            connection could hang the whole scraper forever.

    Returns:
        The page HTML as a string, or None on a non-200 status or any
        network error (the caller treats None as "skip this keyword").
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Fix: a connection/timeout error used to propagate and abort the
        # entire run; degrade to the same None contract as an HTTP error.
        print(f"请求异常:{e}")
        return None
    if response.status_code == 200:
        return response.text
    else:
        print(f"请求失败,状态码:{response.status_code}")
        return None
|
||||
|
||||
def parse_page(content):
    """Parse one search-result page, line by line.

    Args:
        content: Raw HTML of a search-result page.

    Returns:
        A tuple ``(res, next_page)`` where ``res`` is a list of
        ``"{code}__{pubdate}__{title}"`` strings and ``next_page`` is the
        next page number as a string, or None when this is the last page.
    """
    # Patterns are compiled locally so this function is self-contained.
    title_re = re.compile(
        r'<div class="video-title"><strong>(.*?)</strong>\s*(.*?)</div>')
    meta_re = re.compile(r'<div class="meta">')
    # Fix: the href in served HTML may escape the ampersand ('&amp;q='),
    # which the original pattern (literal '&q=') never matched — pagination
    # then silently stopped after page 1. Accept both forms.
    next_re = re.compile(
        r'<a rel="next" class="pagination-next" href=".*?page=(\d+)(?:&amp;|&)q=')

    res = []
    next_page = None
    str1, str2, pubdate = None, None, None
    meta_found = False  # True => the NEXT line carries the publication date

    for line in content.split('\n'):
        # Title line: capture the code (str1) and title text (str2).
        title_match = title_re.search(line)
        if title_match:
            str1 = title_match.group(1).strip()
            str2 = title_match.group(2).strip()

        # Marker line: the date is on the following line.
        if meta_re.search(line):
            meta_found = True
            continue

        if meta_found:
            # Fix: spec requires removing spaces AND tabs everywhere in the
            # line; .strip() only trimmed the ends.
            pubdate = line.replace(' ', '').replace('\t', '')
            meta_found = False

        # Emit one record once all three pieces have been seen.
        if str1 and str2 and pubdate:
            res.append(f"{str1}__{pubdate}__{str2}")
            str1, str2, pubdate = None, None, None

        next_match = next_re.search(line)
        if next_match:
            next_page = next_match.group(1)

    return res, next_page
|
||||
|
||||
def scrape_videos_for_q(q):
    """Scrape every result page for one keyword and save to ./javdb/{q}_all.txt.

    Args:
        q: Keyword prefix (one element of ``qlist``).

    Side effects: performs HTTP requests, sleeps between pages, and writes
    the collected records (one per line) to ``./javdb/{q}_all.txt``.
    """
    p = 1
    res = []
    while True:
        url = base_url.format(p, q)
        print(f"正在访问:{url}")
        page_content = get_page_content(url)

        if not page_content:
            print(f"未能获取页面内容,跳过 q={q} 的处理")
            break

        results, next_page = parse_page(page_content)
        res.extend(results)

        # No "next page" link means we reached the last page for this keyword.
        if not next_page:
            break
        p = int(next_page)
        time.sleep(5)  # throttle so we don't hammer the server

    if res:
        # Fix: open() fails with FileNotFoundError when ./javdb does not
        # exist yet — create the output directory first.
        os.makedirs("./javdb", exist_ok=True)
        output_filename = f"./javdb/{q}_all.txt"
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write("\n".join(res))
        print(f"已保存结果到 {output_filename}")
|
||||
|
||||
def main():
    """Entry point: run the scraper once for every configured keyword."""
    for keyword in qlist:
        scrape_videos_for_q(keyword)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user