"""
|
||
Script Name:
|
||
Description: 获取 javdb 数据, prompt:
|
||
我们需要访问 https://javdb.com/search?f=all&page={p}&q={str} 这个地址,并返回数据,以下是需求详细描述:
|
||
q 参数,我们有一个数组,分别是 qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
|
||
p 参数,是要访问的页码,它通常从1开始。
|
||
|
||
我们循环遍历 qlist,对每一个值,从 p=1 开始,组成一个访问的 URL, 获取该 URL 的内容,它是一个页面;
|
||
对页面内容,循环读取每一行,进行查找:
|
||
如果能匹配 <div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div> 这个格式,那么我们把其中标签修饰的两段文本找出来,分别记为 str1 和str2,然后输出 str1__str2 这样的格式;如果格式不匹配,则不输出;
|
||
如果匹配 <div class="meta">这个格式,那么读取它的下一行,去掉空格与tab符号之后,会剩下一个日期字符串,把这个字符串记为 pubdate;
|
||
我们会得到 str1__pubdate__str2 这样的文本,把它保存到一个变量 res 中;
|
||
继续遍历页面,如果找到匹配 <a rel="next" class="pagination-next" href="/search?f=all&page=5&q=SMBD">下一頁</a> 格式的一行,说明还有下一页,把其中的 page=5 的数字提取出来,修改上面的 URL,填入新的 p值,继续访问;如果无法匹配,那就代表着结束,我们把 res 输出到一个文件中,它命名为 {q}_all.txt
|
||
|
||
请你理解上述需求,并写出对应的python代码。
|
||
|
||
Author: [Your Name]
|
||
Created Date: YYYY-MM-DD
|
||
Last Modified: YYYY-MM-DD
|
||
Version: 1.0
|
||
|
||
|
||
Modification History:
|
||
- YYYY-MM-DD [Your Name]:
|
||
- YYYY-MM-DD [Your Name]:
|
||
- YYYY-MM-DD [Your Name]:
|
||
"""
|
||
|
||
import os
import re
import time

import requests
# from bs4 import BeautifulSoup  # unused: parsing is done line-by-line with regexes

# Parameter definitions
qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']
base_url = "https://javdb.com/search?f=all&page={}&q={}"

# Temporary run: only process the remaining prefixes
qlist = ['SMBD', 'CWPBD', 'DRGBD', 'DSAMBD']

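# Example of a constructed URL (following the address format in the prompt):
# for q='MKBD' and p=1, base_url.format(1, 'MKBD') yields
#   https://javdb.com/search?f=all&page=1&q=MKBD
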
# Regex patterns
title_pattern = r'<div class="video-title"><strong>(.*?)</strong>\s*(.*?)</div>'
meta_pattern = r'<div class="meta">'
# Tolerate both a raw "&" and an HTML-escaped "&amp;" before "q=" in the href
next_page_pattern = r'<a rel="next" class="pagination-next" href=".*?page=(\d+)&(?:amp;)?q='
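
# Quick self-check of title_pattern against the sample line from the prompt
# (illustrative sketch only; safe to delete):
_sample = '<div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div>'
assert re.search(title_pattern, _sample).groups() == ('SHIIKU-001', '性奴●飼育マニュアル THE MOVIE')
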
def get_page_content(url):
    """Send the request and return the page content (None on failure)."""
    # A browser-like User-Agent and a timeout are assumptions added for
    # robustness; javdb may reject the default requests User-Agent.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Request failed, status code: {response.status_code}")
        return None

def parse_page(content):
    """Parse page content; extract titles, dates, and the next-page number."""
    # soup = BeautifulSoup(content, 'html.parser')
    res = []
    next_page = None

    lines = content.split('\n')  # split the page into lines
    str1, str2, pubdate = None, None, None
    meta_found = False

    for line in lines:
        # Try to match a title line
        title_match = re.search(title_pattern, line)
        if title_match:
            str1 = title_match.group(1).strip()
            str2 = title_match.group(2).strip()

        # Try to match <div class="meta">
        if re.search(meta_pattern, line):
            meta_found = True
            continue

        # The previous line was <div class="meta">, so this line holds the date
        if meta_found:
            pubdate = line.strip()
            meta_found = False

        # Once both title and date have matched, store the result
        if str1 and str2 and pubdate:
            res.append(f"{str1}__{pubdate}__{str2}")
            str1, str2, pubdate = None, None, None

        # Try to match the next-page link
        next_page_match = re.search(next_page_pattern, line)
        if next_page_match:
            next_page = next_page_match.group(1)

    return res, next_page

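# Minimal usage sketch for parse_page, assembled from the sample lines in the
# prompt (the 2020-01-01 date is a hypothetical placeholder, not real data):
#
#   demo = '\n'.join([
#       '<div class="video-title"><strong>SHIIKU-001</strong> 性奴●飼育マニュアル THE MOVIE</div>',
#       '<div class="meta">',
#       '    2020-01-01',
#       '</div>',
#   ])
#   parse_page(demo)
#   # -> (['SHIIKU-001__2020-01-01__性奴●飼育マニュアル THE MOVIE'], None)
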
def scrape_videos_for_q(q):
    """Scrape all pages for the given q parameter."""
    p = 1
    res = []
    while True:
        # Build the URL
        url = base_url.format(p, q)
        print(f"Fetching: {url}")
        page_content = get_page_content(url)

        if page_content:
            # Parse the page content
            results, next_page = parse_page(page_content)
            res.extend(results)

            # Continue if there is a next page, otherwise stop
            if next_page:
                p = int(next_page)
                time.sleep(5)  # avoid requesting too quickly
            else:
                break
        else:
            print(f"Could not fetch page content; skipping q={q}")
            break

    # Save the results to a file
    if res:
        output_filename = f"./javdb/{q}_all.txt"
        # Make sure the output directory exists before writing
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write("\n".join(res))
        print(f"Saved results to {output_filename}")

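# Per the prompt, each output file is named {q}_all.txt (e.g. ./javdb/SMBD_all.txt
# for q='SMBD') and holds one str1__pubdate__str2 line per matched video.
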
def main():
    """Main entry point."""
    for q in qlist:
        scrape_videos_for_q(q)


if __name__ == "__main__":
    main()