modify scripts
This commit is contained in:
593
src/crawling/craw.py
Normal file
593
src/crawling/craw.py
Normal file
@ -0,0 +1,593 @@
|
|||||||
|
import json
import logging
import re
import sys
from functools import partial
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import src.utils.utils as utils
|
||||||
|
|
||||||
|
# Sentinel status codes returned by GenericCrawler.fetch_page to callers.
http_code_404 = 404       # HTTP Not Found: page does not exist, caller may skip it
http_code_redirect = 401  # redirected to a verification/login page (reuses the 401 code)
http_code_url = 601       # malformed URL; custom value deliberately outside the HTTP range
|
||||||
|
|
||||||
|
# 通用的爬取类,主要实现了底层的网络交互封装
|
||||||
|
# Generic crawling base class: wraps the low-level network interaction
# (cloudscraper or plain requests), the retry loop, redirect detection and
# page validation. Site-specific crawlers subclass it and override the hooks.
class GenericCrawler:

    def __init__(self, use_cloudscraper=None, headers=None, cookies=None, max_retries=3, html_parser='html.parser'):
        """Configure the crawler.

        Args:
            use_cloudscraper: force cloudscraper on/off; None picks it
                automatically on Python >= 3.8 (cloudscraper's supported range).
            headers: HTTP headers dict; a desktop Chrome User-Agent by default.
            cookies: cookie dict sent with every request (empty by default).
            max_retries: attempts per fetch_page call before giving up.
            html_parser: parser name handed to BeautifulSoup.
        """
        if use_cloudscraper is None:
            use_cloudscraper = sys.version_info >= (3, 8)
        self.use_cloudscraper = use_cloudscraper
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
        }
        self.cookies = cookies or {}
        self.scraper = None  # lazily created in _initialize_scraper
        self.max_retries = max_retries
        self.parser = html_parser
        # cloudscraper is deliberately NOT imported here; see _initialize_scraper.

    def _initialize_scraper(self):
        """Lazily create the HTTP client, avoiding an unnecessary cloudscraper import."""
        if self.scraper is not None:
            return

        if self.use_cloudscraper:
            try:
                # Deferred import: only pay for cloudscraper when it is used.
                import cloudscraper
                self.scraper = cloudscraper.create_scraper()
                logging.info("Using cloudscraper for requests")
            except ImportError:
                logging.warning("cloudscraper not installed. Falling back to requests.")
                self.use_cloudscraper = False
                self.scraper = requests.Session()
        else:
            self.scraper = requests.Session()
            logging.info("Using requests for HTTP operations")

    def fetch_page(self, url, validator, max_retries=None):
        """Fetch *url* and return ``(soup, status_code)``.

        Args:
            url: absolute URL to fetch; rejected up-front if malformed.
            validator: callable(soup) -> bool confirming the page has the
                expected structure (e.g. built via generic_validator).
            max_retries: optional per-call override of self.max_retries
                (backward-compatible addition: callers such as
                test_actor_list pass it, which the old signature rejected).

        Returns:
            (soup, status) on success; (None, http_code_url) for a bad URL;
            (None, http_code_404) for a missing page; (None, http_code_redirect)
            when redirected to a verify page; (None, None) after exhausting
            all retries.
        """
        # Initialize the client on first use.
        self._initialize_scraper()

        retries = self.max_retries if max_retries is None else max_retries
        for attempt in range(retries):
            try:
                if not utils.is_valid_url(url):
                    logging.error(f'wrong url format: {url}')
                    return None, http_code_url

                response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)

                # Handle HTTP status codes: 404 is returned directly so the
                # caller can skip the page instead of retrying.
                if response.status_code == http_code_404:
                    logging.debug(f"Page not found (404): {url}")
                    return None, http_code_404

                response.raise_for_status()  # raise on other HTTP errors

                # A redirect may mean we landed on a verification/login page.
                if response.history:
                    logging.debug(f"Page redirected on {url}. Checking if it's a verify page.")
                    soup = BeautifulSoup(response.text, self.parser)
                    if self.check_redirect(soup):
                        logging.warning(f"Page redirected to verify page on {url}.")
                        return None, http_code_redirect

                # Preprocess the raw HTML (identity unless a subclass overrides).
                html_text = self.preprocessor(response.text)

                soup = BeautifulSoup(html_text, self.parser)
                if validator(soup):  # caller-supplied structural check
                    return soup, response.status_code

                logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
            except Exception as e:
                logging.error(f"Unexpected error on {url}: {e}, Retrying...")

        logging.error(f'Fetching failed after max retries. {url}')
        return None, None  # still failing after the maximum number of retries

    def preprocessor(self, html):
        """Hook: preprocess raw HTML (e.g. repair broken tags). Identity by default."""
        return html

    def check_redirect(self, soup):
        """Hook: return True when a redirected page is a verify/login page.

        The default accepts every redirect target (returns False); subclasses
        override this to detect their site's verification page.
        """
        return False

    @staticmethod
    def generic_validator(soup, tag, identifier, attr_type="id"):
        """Generic HTML structure validator.

        Checks that *soup* contains *tag* with the given *identifier*, matched
        by ``id``, ``class`` or (for <select> elements) ``name`` depending on
        *attr_type*. Unknown attr_type values fail validation.
        """
        if attr_type == "id":
            return soup.find(tag, id=identifier) is not None
        elif attr_type == "class":
            return bool(soup.find_all(tag, class_=identifier))
        elif attr_type == "name":
            return bool(soup.find('select', {'name': identifier}))
        return False
|
||||||
|
|
||||||
|
# javbus.com 网页爬取类
|
||||||
|
# Crawler for javbus.com: site-specific headers/cookies plus parsers for the
# actress index, actor/movie/series/maker detail pages.
#
# Fixes over the previous revision:
#   * every parser referenced an undefined module global `host_url`
#     (NameError at runtime) -> now uses self.host_url;
#   * pretty_print_json printed the undefined name `formatted` for non-list
#     input (NameError);
#   * test_actor_list passed an unsupported max_retries kwarg and looped
#     forever when fetch_page failed (the `break` was unreachable);
#   * the series/makers parsers clobbered their `href` argument inside the
#     loop, corrupting the current-page-number pagination check;
#   * five identical movie-grid parsers and two identical box-list parsers
#     are consolidated into private helpers;
#   * url_page_num was a stub returning None, silently disabling pagination.
class JavbusCrawler(GenericCrawler):

    def __init__(self, use_cloudscraper=None):
        """Configure javbus-specific request headers, cookies and host URL."""
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Sec-Fetch-Site": "none",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "navigate",
            "Host": "www.javbus.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Connection": "keep-alive",
        }

        cookies = {
            'PHPSESSID': 'l9m4ugaaao1hgvl3micr22u3o6',
            'existmag': 'all',
            'age': 'verified'
        }
        super().__init__(use_cloudscraper, headers=headers, cookies=cookies)
        self.host_url = "https://www.javbus.com"

    # ---- shared private helpers ------------------------------------------

    def _next_page_url(self, soup, href):
        """Resolve the 'next page' link (a.pagination-next) to an absolute URL.

        Returns None when there is no further page, or when the link would
        point backwards / to the same page (guard against wrap-around).
        """
        next_page_element = soup.find('a', class_='pagination-next')
        if not next_page_element:
            return None
        next_page_url = next_page_element['href']
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            return self.host_url + next_page_url
        return None

    def _parse_movie_items(self, soup, href):
        """Parse a movie grid page (div class 'movie-list h cols-*').

        Returns ([{'href', 'serial_number', 'title', 'release_date'}, ...],
        next_page_url). Shared by the series/maker/publisher/uncensored pages,
        which all render the same grid.
        """
        # The trailing vcols-N varies per page type, so match on the prefix.
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
        if not div_movies:
            logging.warning("Warning: No movies div found ")
            return [], None

        list_data = []
        for row in div_movies.find_all('div', class_='item'):
            link = row.find('a', class_='box')['href']
            list_data.append({
                'href': self.host_url + link if link else '',
                'serial_number': row.find('strong').text.strip(),
                'title': row.find('div', class_='video-title').text.strip(),
                'release_date': row.find('div', class_='meta').text.strip()
            })

        return list_data, self._next_page_url(soup, href)

    def _parse_box_items(self, soup, href, container_id):
        """Parse an index page of 'a.box' entries (series/makers lists).

        Looks up the container div by *container_id* and returns
        ([{'name', 'href', 'movies'}, ...], next_page_url) — or (None, None)
        when the container is missing. The movie count is taken from the
        '(N)' suffix in the entry's <span>, defaulting to 0.
        """
        container = soup.find("div", id=container_id)
        if not container:
            logging.warning(f"Warning: No {container_id} div found ")
            return None, None

        list_data = []
        for row in container.find_all('a', class_='box'):
            # Use a local name for the entry link: the old code rebound the
            # `href` parameter here, breaking the pagination check below.
            link = row['href']
            movies = 0
            span = row.find('span')
            if span:
                match = re.search(r'\((\d+)\)', span.text.strip())
                if match:
                    movies = int(match.group(1))

            list_data.append({
                'name': row.find('strong').text.strip(),
                'href': self.host_url + link if link else '',
                'movies': movies
            })

        return list_data, self._next_page_url(soup, href)

    # ---- page parsers -----------------------------------------------------

    def parse_actors_list(self, soup, href):
        """Parse an actress index page (div#waterfall).

        Returns ([{'name', 'href', 'pic'}, ...], next_page_url), or
        (None, None) when the expected container is missing.
        """
        div_actors = soup.find("div", id='waterfall')
        if not div_actors:
            logging.warning("Warning: No actors div found ")
            return None, None

        list_data = []
        next_url = None
        for row in div_actors.find_all('div', class_='item'):
            list_data.append({
                'name': row.find('span').text.strip(),      # actress name
                'href': urljoin(self.host_url, row.find('a')['href']),  # detail link
                'pic': row.find('img')['src']               # avatar image URL
            })

        # This page type paginates via an a#next element, not pagination-next.
        div_link = soup.find("div", class_='text-center hidden-xs')
        if div_link:
            next_page_element = soup.find('a', id='next')
            if next_page_element:
                next_url = urljoin(href, next_page_element['href'])

        return list_data, next_url

    def parse_actor_detail(self, soup, href):
        """Parse an actor detail page into ({'pic', 'alias', 'movies'}, next_url).

        Returns (None, None) when the page lacks the expected meta section or
        movie grid.
        """
        # Aliases, when present, are the first 'section-meta' span.
        alias_list = []
        div_meta = soup.find('span', class_='actor-section-name')
        if not div_meta:
            logging.warning(f'warning: no meta data found in page {href}')
            return None, None
        alias_div = soup.find('div', class_='column section-title')
        if alias_div:
            meta_list = alias_div.find_all('span', class_='section-meta')
            if len(meta_list) > 1:
                alias_list = meta_list[0].text.strip().split(", ")

        # Avatar image.
        pic = ''
        avatar = soup.find("div", class_="column actor-avatar")
        if avatar:
            # NOTE(review): parse_avatar_image is not defined in this module —
            # confirm it is supplied by a subclass or mixin before relying on it.
            pic = self.parse_avatar_image(avatar)

        # Movie grid: the trailing vcols-N varies, so match the class prefix.
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
        if not div_movies:
            logging.warning("Warning: No movies div found ")
            return None, None

        list_data = []
        for row in div_movies.find_all('div', class_='item'):
            link = row.find('a', class_='box')['href']
            list_data.append({
                'href': self.host_url + link if link else '',
                'serial_number': row.find('strong').text.strip(),
                'title': row.find('div', class_='video-title').text.strip(),
                'release_date': row.find('div', class_='meta').text.strip()
            })

        next_url = self._next_page_url(soup, href)

        actor = {
            'pic': pic,
            'alias': alias_list,
            'movies': list_data
        }
        return actor, next_url

    def parse_movie_one(self, soup, keys):
        """Return the text of the 'value' span following a <strong> label in *keys*."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                return key_span.text.strip()
        return None

    def parse_movie_val_href(self, soup, keys):
        """Return (text, absolute_link) for a labelled value; link is None when
        the value has no anchor; (None, None) when the label is absent."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                a_tag = key_span.find('a')
                if a_tag:
                    return a_tag.text.strip(), self.host_url + a_tag.get('href')
                return key_span.text.strip(), None
        return None, None

    def parse_movie_arr(self, soup, keys):
        """Return [{'name', 'href'}, ...] for a labelled multi-value field
        (tags, actors); empty list when the label is absent."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                return [
                    {
                        'name': a_tag.text.strip(),
                        'href': self.host_url + a_tag.get('href')
                    }
                    for a_tag in key_span.find_all('a')
                ]
        return []

    def parse_movie_detail(self, soup, href, title):
        """Parse a movie detail page into a flat dict of metadata.

        Keys include href/title/cover_url, serial number, dates, maker/series/
        publisher (name + link), tags and actors. Returns (None, None) when the
        meta panel is missing (kept for backward compatibility with callers
        that unpack two values on failure).
        """
        div_video = soup.find("div", class_='video-meta-panel')
        if not div_video:
            logging.warning("Warning: No movies div found ")
            return None, None

        result = {}
        result['href'] = href
        result['title'] = title

        # Cover image (the anchor wraps the full-size image).
        cover_img = soup.select_one('.column-video-cover a')
        result['cover_url'] = cover_img['href'] if cover_img else None

        # Serial number and simple labelled values (labels appear in Chinese
        # or English depending on site language).
        result['serial_number'] = self.parse_movie_one(soup, ['番號:', 'ID:'])
        result['release_date'] = self.parse_movie_one(soup, ['日期:', 'Released Date:'])
        result['duration'] = self.parse_movie_one(soup, ['時長:', 'Duration:'])

        # Maker, series and publisher: value plus detail-page link.
        result['maker_name'], result['maker_link'] = self.parse_movie_val_href(soup, ['片商:', 'Maker:'])
        result['series_name'], result['series_link'] = self.parse_movie_val_href(soup, ['系列:', 'Series:'])
        result['pub_name'], result['pub_link'] = self.parse_movie_val_href(soup, ['發行:', 'Publisher:'])

        # Multi-valued fields: tags and actors.
        result['tags'] = self.parse_movie_arr(soup, ['類別:', 'Tags:'])
        result['actors'] = self.parse_movie_arr(soup, ['演員:', 'Actor(s):'])

        return result

    def parse_series_uncensored(self, soup, href):
        """Parse the uncensored series index (div#series) into (entries, next_url)."""
        return self._parse_box_items(soup, href, 'series')

    def parse_series_detail(self, soup, href):
        """Parse a series detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_makers_uncensored(self, soup, href):
        """Parse the uncensored makers index (div#makers) into (entries, next_url)."""
        return self._parse_box_items(soup, href, 'makers')

    def parse_maker_detail(self, soup, href):
        """Parse a maker detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_publisher_detail(self, soup, href):
        """Parse a publisher detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_uncensored(self, soup, href):
        """Parse the uncensored movie listing into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    @staticmethod
    def pretty_print_json(data, n=10, indent=4, sort_keys=False):
        """Pretty-print the first *n* elements of a list; longer lists are
        truncated with a '... (K more elements)' marker. Non-list data is
        printed whole (the previous revision referenced an undefined name
        here and raised NameError).

        Args:
            data: the data to print (normally a list).
            n: number of leading elements to show.
            indent: JSON indentation width.
            sort_keys: whether to sort object keys.
        """
        try:
            if not isinstance(data, list):
                print(json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
                return

            # Slice (never mutate) the caller's list.
            first_n_elements = data[:n]
            if len(data) > n:
                result = first_n_elements + ["... ({} more elements)".format(len(data) - n)]
            else:
                result = first_n_elements

            print(json.dumps(result, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
        except TypeError as e:
            print(f"错误:无法格式化数据。详情:{e}")
        except Exception as e:
            print(f"打印时发生意外错误:{e}")

    def test_actor_list(self, url='https://www.javbus.com/uncensored/actresses/1'):
        """Manual smoke test: crawl the actress index from *url*, printing the
        aggregated results page by page. Performs live network I/O.

        Fixed here: the old version passed max_retries=1 (not accepted by
        fetch_page) and never reached its `break` when a fetch failed,
        retrying the same URL forever.
        """
        next_url = url
        all_data = []
        while next_url:
            print(f'fetching page {next_url}')
            soup, status_code = self.fetch_page(
                next_url,
                partial(self.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
            if soup:
                list_data, next_url = self.parse_actors_list(soup, next_url)
                if list_data:
                    all_data.extend(list_data)
                    self.pretty_print_json(all_data)
                else:
                    print('get wrong page.')
                if next_url:
                    print(f"\n\nnext url: {next_url}")
            else:
                # Stop instead of spinning on a page that keeps failing.
                print(f"wrong request. url: {next_url}, status_code: {status_code}")
                break

    def url_page_num(self, url):
        """Extract the page number from a javbus-style URL.

        Recognizes a `page=N` query parameter or a trailing numeric path
        segment (e.g. /uncensored/actresses/2). Returns None when no page
        number is present — the previous stub always returned None, which
        silently disabled pagination in every list parser.
        """
        if not url:
            return None
        match = re.search(r'[?&]page=(\d+)', url)
        if match:
            return int(match.group(1))
        match = re.search(r'/(\d+)/?$', url.split('?', 1)[0])
        if match:
            return int(match.group(1))
        return None
|
||||||
@ -1,71 +0,0 @@
|
|||||||
import logging
|
|
||||||
import cloudscraper
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import src.utils.utils as utils
|
|
||||||
|
|
||||||
# 设置 headers 和 scraper
|
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
|
|
||||||
}
|
|
||||||
# 定义 cookie
|
|
||||||
cookies = {
|
|
||||||
}
|
|
||||||
scraper = cloudscraper.create_scraper()
|
|
||||||
|
|
||||||
http_code_404 = 404
|
|
||||||
http_code_login = 401
|
|
||||||
http_code_local = 99
|
|
||||||
logging.getLogger().setLevel(logging.DEBUG)
|
|
||||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
|
||||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None, headers=headers, cookies=cookies):
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
if not utils.is_valid_url(url):
|
|
||||||
logging.error(f'wrong url format: {url}')
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
response = scraper.get(url, headers=headers, cookies=cookies)
|
|
||||||
|
|
||||||
# 处理 HTTP 状态码
|
|
||||||
if response.status_code == 404:
|
|
||||||
logging.debug(f"Page not found (404): {url}")
|
|
||||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
|
||||||
|
|
||||||
response.raise_for_status() # 处理 HTTP 错误
|
|
||||||
|
|
||||||
# 检查是否发生跳转,比如到登录页面
|
|
||||||
if response.history:
|
|
||||||
logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
|
|
||||||
soup = BeautifulSoup(response.text, parser)
|
|
||||||
# 判断是否为登录页面,
|
|
||||||
if soup.find('div', id='ageVerify'):
|
|
||||||
logging.warning(f"Page redirected to login page on {url}.")
|
|
||||||
return None, http_code_login
|
|
||||||
|
|
||||||
# 预处理 HTML(如果提供了 preprocessor)
|
|
||||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
|
||||||
|
|
||||||
soup = BeautifulSoup(html_text, parser)
|
|
||||||
if validator(soup): # 进行自定义页面检查
|
|
||||||
return soup, response.status_code
|
|
||||||
|
|
||||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
|
||||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
|
||||||
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
|
|
||||||
except cloudscraper.exceptions.CloudflareCode1020 as e:
|
|
||||||
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
|
||||||
|
|
||||||
logging.error(f'Fetching failed after max retries. {url}')
|
|
||||||
return None, None # 达到最大重试次数仍然失败
|
|
||||||
|
|
||||||
# 通用的 HTML 结构验证器
|
|
||||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
|
||||||
if attr_type == "id":
|
|
||||||
return soup.find(tag, id=identifier) is not None
|
|
||||||
elif attr_type == "class":
|
|
||||||
return bool(soup.find_all(tag, class_=identifier))
|
|
||||||
elif attr_type == "name":
|
|
||||||
return bool(soup.find('select', {'name': identifier}))
|
|
||||||
return False
|
|
||||||
@ -1,515 +0,0 @@
|
|||||||
import cloudscraper
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
from functools import partial
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
import src.config.config as config
|
|
||||||
import src.utils.utils as utils
|
|
||||||
import src.crawling.craw_common as scraper
|
|
||||||
|
|
||||||
# 定义基础 URL 和可变参数
|
|
||||||
host_url = "https://www.javbus.com"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Host": "www.javbus.com",
|
|
||||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
|
|
||||||
"Accept-Language": "zh-CN,zh-Hans;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
}
|
|
||||||
|
|
||||||
cookies = {
|
|
||||||
'PHPSESSID': 'l9m4ugaaao1hgvl3micr22u3o6',
|
|
||||||
'existmag': 'all',
|
|
||||||
'age': 'verified'
|
|
||||||
}
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
def parse_actors_list(soup, href):
|
|
||||||
div_actors = soup.find("div", id='waterfall')
|
|
||||||
if not div_actors:
|
|
||||||
logging.warning(f"Warning: No actors div found ")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# 解析元素
|
|
||||||
rows = div_actors.find_all('div', class_='item')
|
|
||||||
|
|
||||||
list_data = []
|
|
||||||
next_url = None
|
|
||||||
for row in rows:
|
|
||||||
# 获取演员详情链接
|
|
||||||
actor_link = row.find('a')['href']
|
|
||||||
# 获取演员名字
|
|
||||||
actor_name = row.find('span').text.strip()
|
|
||||||
# 获取头像图片链接
|
|
||||||
avatar_url = row.find('img')['src']
|
|
||||||
|
|
||||||
list_data.append({
|
|
||||||
'name' : actor_name,
|
|
||||||
'href' : urljoin(host_url, actor_link),
|
|
||||||
'pic' : avatar_url
|
|
||||||
})
|
|
||||||
|
|
||||||
# 查找 "下一页" 按钮
|
|
||||||
div_link = soup.find("div", class_='text-center hidden-xs')
|
|
||||||
if div_link:
|
|
||||||
next_page_element = soup.find('a', id='next')
|
|
||||||
if next_page_element:
|
|
||||||
next_page_url = next_page_element['href']
|
|
||||||
next_url = urljoin(href, next_page_url)
|
|
||||||
|
|
||||||
return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
def parse_actor_detail(soup, href):
|
|
||||||
# 先找一下别名
|
|
||||||
alias_list = []
|
|
||||||
|
|
||||||
div_meta = soup.find('span', class_='actor-section-name')
|
|
||||||
if not div_meta:
|
|
||||||
logging.warning(f'warning: no meta data found in page {href}')
|
|
||||||
return None, None
|
|
||||||
alias_div = soup.find('div', class_='column section-title')
|
|
||||||
|
|
||||||
if alias_div:
|
|
||||||
meta_list = alias_div.find_all('span', class_='section-meta')
|
|
||||||
if len(meta_list) > 1:
|
|
||||||
alias_list = meta_list[0].text.strip().split(", ")
|
|
||||||
|
|
||||||
# 头像
|
|
||||||
pic = ''
|
|
||||||
avatar = soup.find("div", class_="column actor-avatar")
|
|
||||||
if avatar:
|
|
||||||
pic = parse_avatar_image(avatar)
|
|
||||||
|
|
||||||
# 返回数据
|
|
||||||
actor = {}
|
|
||||||
|
|
||||||
# 使用正则表达式查找 class 包含 'movie-list h cols-4' 的 div 元素
|
|
||||||
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
|
|
||||||
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
|
|
||||||
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
|
|
||||||
if not div_movies:
|
|
||||||
logging.warning(f"Warning: No movies div found ")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# 解析元素
|
|
||||||
rows = div_movies.find_all('div', class_='item')
|
|
||||||
|
|
||||||
list_data = []
|
|
||||||
next_url = None
|
|
||||||
for row in rows:
|
|
||||||
link = row.find('a', class_='box')['href']
|
|
||||||
serial_number = row.find('strong').text.strip()
|
|
||||||
title = row.find('div', class_='video-title').text.strip()
|
|
||||||
release_date = row.find('div', class_='meta').text.strip()
|
|
||||||
list_data.append({
|
|
||||||
'href' : host_url + link if link else '',
|
|
||||||
'serial_number' : serial_number,
|
|
||||||
'title' : title,
|
|
||||||
'release_date': release_date
|
|
||||||
})
|
|
||||||
|
|
||||||
# 查找 "下一页" 按钮
|
|
||||||
next_page_element = soup.find('a', class_='pagination-next')
|
|
||||||
if next_page_element:
|
|
||||||
next_page_url = next_page_element['href']
|
|
||||||
next_page_number = url_page_num(next_page_url)
|
|
||||||
current_page_number = url_page_num(href)
|
|
||||||
logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
|
|
||||||
if current_page_number is None:
|
|
||||||
current_page_number = 0
|
|
||||||
if next_page_number and next_page_number > current_page_number :
|
|
||||||
next_url = host_url + next_page_url
|
|
||||||
|
|
||||||
actor = {
|
|
||||||
'pic' : pic,
|
|
||||||
'alias' : alias_list,
|
|
||||||
'movies' : list_data
|
|
||||||
}
|
|
||||||
|
|
||||||
return actor, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析单个元素
|
|
||||||
def parse_movie_one(soup, keys):
|
|
||||||
key_strong = soup.find('strong', string=lambda text: text in keys)
|
|
||||||
if key_strong:
|
|
||||||
key_span = key_strong.find_next_sibling('span', class_='value')
|
|
||||||
if key_span:
|
|
||||||
return key_span.text.strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
# 解析值和链接
|
|
||||||
# 解析值和链接
def parse_movie_val_href(soup, keys):
    """Return (text, absolute_link) for the labelled value.

    The link is taken from an <a> inside the value span when present
    (prefixed with host_url); otherwise the plain text is returned with
    link None. Returns (None, None) when the label or value is missing.
    """
    label = soup.find('strong', string=lambda text: text in keys)
    if label is not None:
        value_span = label.find_next_sibling('span', class_='value')
        if value_span is not None:
            anchor = value_span.find('a')
            if anchor is None:
                return value_span.text.strip(), None
            return anchor.text.strip(), host_url + anchor.get('href')
    return None, None
|
|
||||||
|
|
||||||
# 解析多个值和链接
|
|
||||||
# 解析多个值和链接
def parse_movie_arr(soup, keys):
    """Return a list of {'name', 'href'} dicts for every <a> inside the
    labelled value span; [] when the label or value span is missing."""
    label = soup.find('strong', string=lambda text: text in keys)
    if label is None:
        return []
    value_span = label.find_next_sibling('span', class_='value')
    if value_span is None:
        return []
    return [
        {'name': anchor.text.strip(), 'href': host_url + anchor.get('href')}
        for anchor in value_span.find_all('a')
    ]
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_movie_detail(soup, href, title):
    """Extract the metadata of a single movie detail page.

    Returns a dict of movie fields on success, or (None, None) when the
    meta panel is missing (NOTE: the success path returns a single dict,
    matching the original behavior).
    """
    panel = soup.find("div", class_='video-meta-panel')
    if panel is None:
        logging.warning(f"Warning: No movies div found ")
        return None, None

    # 获取封面图片
    cover_anchor = soup.select_one('.column-video-cover a')

    # 获取maker,系列
    maker_name, maker_link = parse_movie_val_href(soup, ['片商:', 'Maker:'])
    series_name, series_link = parse_movie_val_href(soup, ['系列:', 'Series:'])
    pub_name, pub_link = parse_movie_val_href(soup, ['發行:', 'Publisher:'])

    # Key order mirrors the original insertion order.
    return {
        'href': href,
        'title': title,
        'cover_url': cover_anchor['href'] if cover_anchor else None,
        'serial_number': parse_movie_one(soup, ['番號:', 'ID:']),
        'release_date': parse_movie_one(soup, ['日期:', 'Released Date:']),
        'duration': parse_movie_one(soup, ['時長:', 'Duration:']),
        'maker_name': maker_name,
        'maker_link': maker_link,
        'series_name': series_name,
        'series_link': series_link,
        'pub_name': pub_name,
        'pub_link': pub_link,
        'tags': parse_movie_arr(soup, ['類別:', 'Tags:']),
        'actors': parse_movie_arr(soup, ['演員:', 'Actor(s):']),
    }
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_series_uncensored(soup, href):
    """Parse one page of the uncensored series index.

    Args:
        soup: BeautifulSoup document of the listing page.
        href: URL of the page being parsed (compared against the
              "next page" link to decide whether to paginate forward).

    Returns:
        (list_data, next_url): list_data is a list of
        {'name', 'href', 'movies'} dicts; next_url is the absolute URL of
        the next page or None. Returns (None, None) when the series
        container is missing.
    """
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning(f"Warning: No div_series div found ")
        return None, None

    # 解析元素
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        # Bug fix: this was assigned to `href`, shadowing the function
        # parameter, so the pagination check below compared against the
        # last row's link instead of the current page URL.
        row_href = row['href']
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            # movie count is rendered like "(123)"
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + row_href if row_href else '',
            'movies' : movies
        })

    # 查找 "下一页" 按钮 — only follow it when it points strictly forward,
    # which guards against pagination loops.
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_series_detail(soup, href):
    """Parse one page of a series' movie listing.

    Returns (movies, next_url); movies is [] when the listing container
    is absent, next_url is None when there is no forward page.
    """
    container = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if container is None:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    def _entry(item):
        # One movie card -> dict of its fields.
        box_link = item.find('a', class_='box')['href']
        return {
            'href': host_url + box_link if box_link else '',
            'serial_number': item.find('strong').text.strip(),
            'title': item.find('div', class_='video-title').text.strip(),
            'release_date': item.find('div', class_='meta').text.strip(),
        }

    movies = [_entry(item) for item in container.find_all('div', class_='item')]

    # Follow the "next page" button only when it points strictly forward.
    follow_url = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        candidate = pager['href']
        upcoming = url_page_num(candidate)
        current = url_page_num(href)
        if current is None:
            current = 0
        if upcoming and upcoming > current:
            follow_url = host_url + candidate

    return movies, follow_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_makers_uncensored(soup, href):
    """Parse one page of the uncensored makers index.

    Args:
        soup: BeautifulSoup document of the listing page.
        href: URL of the page being parsed (compared against the
              "next page" link to decide whether to paginate forward).

    Returns:
        (list_data, next_url): list_data is a list of
        {'name', 'href', 'movies'} dicts; next_url is the absolute URL of
        the next page or None. Returns (None, None) when the makers
        container is missing.
    """
    div_series = soup.find("div", id='makers')
    if not div_series:
        logging.warning(f"Warning: No makers div found ")
        return None, None

    # 解析元素
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        # Bug fix: this was assigned to `href`, shadowing the function
        # parameter, so the pagination check below compared against the
        # last row's link instead of the current page URL.
        row_href = row['href']
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            # movie count is rendered like "(123)"
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + row_href if row_href else '',
            'movies' : movies
        })

    # 查找 "下一页" 按钮 — only follow it when it points strictly forward,
    # which guards against pagination loops.
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_maker_detail(soup, href):
    """Parse one page of a maker's movie listing.

    Returns (movies, next_url); movies is [] when the listing container
    is absent, next_url is None when there is no forward page.
    """
    listing = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not listing:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    movies = []
    for item in listing.find_all('div', class_='item'):
        rel_link = item.find('a', class_='box')['href']
        movies.append({
            'href': host_url + rel_link if rel_link else '',
            'serial_number': item.find('strong').text.strip(),
            'title': item.find('div', class_='video-title').text.strip(),
            'release_date': item.find('div', class_='meta').text.strip(),
        })

    # Follow the "next page" button only when it points strictly forward.
    next_url = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        target = pager['href']
        target_num = url_page_num(target)
        page_num = url_page_num(href)
        if page_num is None:
            page_num = 0
        if target_num and target_num > page_num:
            next_url = host_url + target

    return movies, next_url
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_publisher_detail(soup, href):
    """Parse one page of a publisher's movie listing (same card layout
    as the maker/series listings).

    Returns (entries, successor); entries is [] when the grid is absent,
    successor is the next page URL or None.
    """
    grid = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if grid is None:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    entries = []
    for cell in grid.find_all('div', class_='item'):
        rel = cell.find('a', class_='box')['href']
        record = {
            'href': (host_url + rel) if rel else '',
            'serial_number': cell.find('strong').text.strip(),
            'title': cell.find('div', class_='video-title').text.strip(),
            'release_date': cell.find('div', class_='meta').text.strip(),
        }
        entries.append(record)

    # Only advance when the "next page" link points strictly forward.
    successor = None
    pager = soup.find('a', class_='pagination-next')
    if pager is not None:
        nxt_href = pager['href']
        nxt_num = url_page_num(nxt_href)
        cur_num = url_page_num(href)
        cur_num = 0 if cur_num is None else cur_num
        if nxt_num and nxt_num > cur_num:
            successor = host_url + nxt_href

    return entries, successor
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_uncensored(soup, href):
    """Parse one page of the uncensored movie index.

    Returns (page_items, forward); page_items is [] when the movie grid
    is absent, forward is the next page URL or None.
    """
    movie_div = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not movie_div:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    def _record(node):
        # One movie card -> dict of its fields.
        path = node.find('a', class_='box')['href']
        return {
            'href': host_url + path if path else '',
            'serial_number': node.find('strong').text.strip(),
            'title': node.find('div', class_='video-title').text.strip(),
            'release_date': node.find('div', class_='meta').text.strip(),
        }

    page_items = list(map(_record, movie_div.find_all('div', class_='item')))

    # Only advance when the "next page" link points strictly forward.
    forward = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        pager_href = pager['href']
        pager_page = url_page_num(pager_href)
        this_page = url_page_num(href)
        if this_page is None:
            this_page = 0
        if pager_page and pager_page > this_page:
            forward = host_url + pager_href

    return page_items, forward
|
|
||||||
|
|
||||||
|
|
||||||
def pretty_print_json(data, n=10, indent=4, sort_keys=False):
    """Pretty-print the first *n* elements of a list; the rest are
    summarized as a "... (k more elements)" marker.

    Args:
        data: the value to print (usually a list; other types are
              printed whole).
        n: number of list elements to show.
        indent: indentation width passed to json.dumps.
        sort_keys: whether json.dumps sorts dict keys.
    """
    try:
        # Non-list data is printed whole.
        # Bug fix: this branch previously printed `formatted` before it was
        # ever assigned, raising NameError (swallowed by the broad except),
        # so non-list data was never printed.
        if not isinstance(data, list):
            print(json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
            return

        # Take the first n elements without mutating the caller's list.
        shown = data[:n]

        # If the list is longer than n, append a "..." marker.
        if len(data) > n:
            shown = shown + ["... ({} more elements)".format(len(data) - n)]

        print(json.dumps(shown, indent=indent, ensure_ascii=False, sort_keys=sort_keys))

    except TypeError as e:
        print(f"错误:无法格式化数据。详情:{e}")
    except Exception as e:
        print(f"打印时发生意外错误:{e}")
|
|
||||||
|
|
||||||
def test_actor_list(url='https://www.javbus.com/uncensored/actresses/1'):
    """Manual smoke test: crawl the actress listing starting at *url*,
    accumulating parsed rows and printing them after each page."""
    pending = url
    collected = []
    while pending:
        print(f'fetching page {pending}')
        soup, status_code = scraper.fetch_page(
            pending,
            partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"),
            max_retries=1, headers=headers, cookies=cookies)
        if not soup:
            # Fetch failed: report and stop crawling.
            print(f"wrong request. url: {pending}, status_code: {status_code}")
            break
        page_rows, pending = parse_actors_list(soup, pending)
        if page_rows:
            collected.extend(page_rows)
            pretty_print_json(collected)
        else:
            print('get wrong page.')
        if pending:
            print(f"\n\nnext url: {pending}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual smoke tests — uncomment the scenario to exercise.
    #test_actors_list()
    #test_actor()
    #test_movie_detail()
    #test_series_list()
    #test_series_detail()
    # Verbose logging so page-number decisions are visible while crawling.
    logging.getLogger().setLevel(logging.DEBUG)
    test_actor_list()
    test_actor_list('https://www.javbus.com/en/actresses')
|
|
||||||
|
|
||||||
@ -1,121 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from datetime import datetime
|
|
||||||
import src.config.config as config
|
|
||||||
|
|
||||||
# 连接 SQLite 数据库
|
|
||||||
# SQLite database location.
DB_PATH = f"{config.global_share_data_dir}/sqlite/shared.db"  # replace with your database file

# Detect SQLite builds older than 3.24.0, which lack
# "INSERT ... ON CONFLICT DO UPDATE" (upsert) support; the insert/update
# helpers below switch to an exception-based fallback in that case.
lower_sqlite_version = False
sqlite_version = sqlite3.sqlite_version_info
if sqlite_version < (3, 24, 0):
    lower_sqlite_version = True
|
|
||||||
|
|
||||||
# 获取表的列名和默认值
|
|
||||||
# 获取表的列名和默认值
def get_table_columns_and_defaults(cursor, tbl_name):
    """Return a {column_name: default_value} mapping for *tbl_name* via
    PRAGMA table_info, or None when the pragma fails."""
    try:
        cursor.execute(f"PRAGMA table_info({tbl_name})")
        # table_info rows: (cid, name, type, notnull, dflt_value, pk)
        return {row[1]: row[4] for row in cursor.fetchall()}
    except sqlite3.Error as e:
        logging.error(f"Error getting table columns: {e}")
        return None
|
|
||||||
|
|
||||||
# 检查并处理数据
|
|
||||||
# 检查并处理数据
def check_and_process_data(cursor, data, tbl_name):
    """Filter *data* down to columns that exist in *tbl_name*, skipping
    DB-managed columns and stamping updated_at.

    Returns the filtered dict, or None when the table schema could not
    be read.
    """
    column_info = get_table_columns_and_defaults(cursor=cursor, tbl_name=tbl_name)
    if column_info is None:
        return None

    processed = {}
    for column in column_info:
        if column in ('id', 'created_at'):
            # auto-increment PK / creation timestamp: left to table defaults
            continue
        if column == 'updated_at':
            # stamp with "now"; an explicit value in *data* still wins below
            processed[column] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if column in data:
            processed[column] = data[column]

    return processed
|
|
||||||
|
|
||||||
|
|
||||||
# 插入或更新数据
|
|
||||||
# 插入或更新数据
def insert_or_update_common(cursor, conn, data, tbl_name, uniq_key='url'):
    """Upsert *data* into *tbl_name* keyed on *uniq_key*.

    Uses "INSERT ... ON CONFLICT DO UPDATE"; on SQLite < 3.24.0 (no
    upsert support) it delegates to insert_or_update_common_lower.
    Returns the row's id, or None on failure.
    """
    if lower_sqlite_version:
        return insert_or_update_common_lower(cursor, conn, data, tbl_name, uniq_key)

    try:
        # Keep only columns that exist in the table (also stamps updated_at).
        processed_data = check_and_process_data(cursor, data, tbl_name)
        if processed_data is None:
            return None

        columns = ', '.join(processed_data.keys())
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])
        # On conflict, overwrite every column except the unique key itself.
        update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

        # NOTE: table/column names are interpolated into the SQL; they must
        # come from trusted code, only the values are parameterized.
        sql = f'''
            INSERT INTO {tbl_name} ({columns})
            VALUES ({placeholders})
            ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
        '''
        cursor.execute(sql, values)
        conn.commit()

        # Fetch the id of the row that was inserted or updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
|
||||||
|
|
||||||
# 插入或更新数据
|
|
||||||
# 插入或更新数据
def insert_or_update_common_lower(cursor, conn, data, tbl_name, uniq_key='url'):
    """Upsert fallback for SQLite < 3.24.0 (no ON CONFLICT ... DO UPDATE):
    try a plain INSERT first; when it hits a unique-key IntegrityError,
    fall back to an UPDATE of the existing row.

    Returns the row's id, or None on failure.
    """
    try:
        # Keep only columns that exist in the table (also stamps updated_at).
        processed_data = check_and_process_data(cursor, data, tbl_name)
        if processed_data is None:
            return None

        columns = ', '.join(processed_data.keys())
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # First attempt: plain insert.
        try:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            cursor.execute(sql, values)
            conn.commit()
        except sqlite3.IntegrityError:  # unique-key conflict -> update instead
            update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
            update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
            # WHERE value comes from the caller-supplied data, matching the
            # unique key used by the INSERT.
            update_values.append(data[uniq_key])
            sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
            cursor.execute(sql, update_values)
            conn.commit()

        # Fetch the id of the row that was inserted or updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
|
||||||
|
|
||||||
|
|
||||||
# 测试代码
|
|
||||||
# Ad-hoc test entry point: dump the schema of one table.
if __name__ == "__main__":
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    cursor = conn.cursor()

    tbl_name_actors = 'javhd_models'
    print(get_table_columns_and_defaults(cursor, tbl_name_actors))
|
|
||||||
File diff suppressed because it is too large
Load Diff
204
src/db_utils/sqlite_db.py
Normal file
204
src/db_utils/sqlite_db.py
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
import src.config.config as config
|
||||||
|
|
||||||
|
# Default location of the shared SQLite database file.
default_dbpath = f"{config.global_share_data_dir}/sqlite/shared.db"
|
||||||
|
|
||||||
|
# 数据库基类,封装了通用的操作。
|
||||||
|
# 数据库基类,封装了通用的操作。
class DatabaseHandler:
    """Base SQLite handler: owns a connection/cursor and provides generic
    schema-aware insert-or-update helpers for subclasses."""

    def __init__(self, db_path=None):
        """Open (and if needed create the directory for) the database at
        *db_path*, falling back to the module default path."""
        # Use the caller-supplied path or the default one.
        self.DB_PATH = db_path or default_dbpath

        # Ensure the parent directory exists (only for explicit paths).
        if db_path and not os.path.exists(os.path.dirname(db_path)):
            os.makedirs(os.path.dirname(db_path))

        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.cursor = self.conn.cursor()

        # SQLite < 3.24.0 lacks "INSERT ... ON CONFLICT DO UPDATE";
        # remember that so the upsert helper can use the fallback path.
        self.lower_sqlite_version = False
        sqlite_version = sqlite3.sqlite_version_info
        if sqlite_version < (3, 24, 0):
            self.lower_sqlite_version = True

    def get_table_columns_and_defaults(self, tbl_name):
        """Return {column_name: default_value} for *tbl_name* via
        PRAGMA table_info, or None when the pragma fails."""
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                # table_info rows: (cid, name, type, notnull, dflt_value, pk)
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def check_and_process_data(self, data, tbl_name):
        """Filter *data* down to columns that exist in *tbl_name*, skipping
        DB-managed columns and stamping updated_at. Returns the filtered
        dict, or None when the schema could not be read."""
        column_info = self.get_table_columns_and_defaults(tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment PK / creation timestamp: table defaults apply
                continue
            if col == 'updated_at':  # stamp with "now"; an explicit value in *data* still wins below
                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if col in data:
                processed_data[col] = data[col]

        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
        """Upsert *data* into *tbl_name* keyed on *uniq_key*; returns the
        row id or None. Delegates to the fallback on old SQLite."""
        if self.lower_sqlite_version:
            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)

        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            # On conflict, overwrite every column except the unique key.
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()

            # Fetch the id of the row that was inserted or updated.
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            report_id = self.cursor.fetchone()[0]
            return report_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
        """Upsert fallback for SQLite < 3.24.0: plain INSERT first, then
        UPDATE on a unique-key IntegrityError. Returns the row id or None."""
        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])

            # First attempt: plain insert.
            try:
                sql = f'''
                    INSERT INTO {tbl_name} ({columns})
                    VALUES ({placeholders})
                '''
                self.cursor.execute(sql, values)
                self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict -> update instead
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
                update_values.append(data[uniq_key])
                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
                self.cursor.execute(sql, update_values)
                self.conn.commit()

            # Fetch the id of the row that was inserted or updated.
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            report_id = self.cursor.fetchone()[0]
            return report_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_task_log(self):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def update_task_log(self, task_id, task_status):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def finalize_task_log(self, task_id):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def close(self):
        """Release the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# javbus 类
|
||||||
|
# javbus 类
class JavbusDBHandler(DatabaseHandler):
    """DatabaseHandler specialized for the javbus_actors table."""

    def __init__(self, db_path=None):
        super().__init__(db_path)
        # Table holding the crawled actor index/details.
        self.tbl_name_actors = 'javbus_actors'

    def insert_actor_index(self, data, uncensored=0, from_actor_list=0, from_movie_list=0):
        """Upsert an actor index row (keyed on 'href'); returns the row id
        or None. The origin flags are only written when truthy, so an
        existing flag is never cleared by a later upsert."""
        data['uncensored'] = uncensored
        if from_actor_list:
            data['from_actor_list'] = from_actor_list
        if from_movie_list:
            data['from_movie_list'] = from_movie_list
        try:
            return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def update_actor_detail(self, data, is_full_data=1):
        """Upsert an actor's full detail row (keyed on 'href'), marking it
        with the given is_full_data state; returns the row id or None."""
        try:
            data['is_full_data'] = is_full_data
            return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def query_actors(self, **filters):
        """Query actors with optional filters (id, url, en_name,
        is_full_data, start_id, is_full_data_in/_not_in, order_by, limit);
        returns a list of {'url', 'name'} dicts, or None on error."""
        try:
            # NOTE(review): the SELECT reads columns `url` and `en_name`,
            # but the filters below compare `href` and `name` — verify the
            # actual javbus_actors schema; this looks inconsistent with the
            # 'href' unique key used by the insert methods.
            sql = f"SELECT url, en_name as name FROM {self.tbl_name_actors} WHERE 1=1"
            params = []

            # Filter name -> SQL fragment; one bound parameter each.
            conditions = {
                "id": " AND id = ?",
                "url": " AND href = ?",
                "en_name": " AND name LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "en_name":
                        # substring match
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            # Set-membership filters on is_full_data.
            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # ORDER BY takes a column name directly; it cannot be a bound
                # parameter (it would be treated as a string literal), so the
                # caller-supplied value is interpolated — must be trusted.
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [{'url': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"查询 href 失败: {e}")
            return None
|
||||||
|
|
||||||
@ -9,12 +9,13 @@ from functools import partial
|
|||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
import src.config.config as config
|
import src.config.config as config
|
||||||
import src.logger.logger as logger
|
import src.logger.logger as logger
|
||||||
import src.db_utils.db_javbus as db_tools
|
import src.db_utils.sqlite_db as sqlite_db
|
||||||
import src.crawling.craw_common as scraper_base
|
import src.crawling.craw as craw
|
||||||
import src.crawling.craw_javbus as scraper
|
|
||||||
import src.utils.utils as utils
|
import src.utils.utils as utils
|
||||||
|
|
||||||
logger.setup_logging()
|
logger.setup_logging()
|
||||||
|
db_tools = sqlite_db.JavbusDBHandler()
|
||||||
|
scraper = craw.JavbusCrawler()
|
||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
skip_local = False
|
skip_local = False
|
||||||
@ -34,7 +35,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
num = 1
|
num = 1
|
||||||
while current_url:
|
while current_url:
|
||||||
logging.info(f"fetching url {current_url}")
|
logging.info(f"fetching url {current_url}")
|
||||||
soup, status_code = scraper_base.fetch_page(current_url, partial(scraper_base.generic_validator, tag="div", identifier="waterfall", attr_type="id"), headers=scraper.headers, cookies=scraper.cookies)
|
soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||||
if soup:
|
if soup:
|
||||||
list_data, current_url = scraper.parse_actors_list(soup, current_url)
|
list_data, current_url = scraper.parse_actors_list(soup, current_url)
|
||||||
if list_data :
|
if list_data :
|
||||||
@ -50,9 +51,12 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
else:
|
else:
|
||||||
logging.warning(f'fetch actor error. {current_url} ...')
|
logging.warning(f'fetch actor error. {current_url} ...')
|
||||||
|
|
||||||
elif status_code and status_code == 404:
|
elif status_code :
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
|
||||||
break
|
break
|
||||||
|
else: # 达到失败上限,加上休眠继续重试
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
time.sleep(0.3)
|
time.sleep(0.3)
|
||||||
|
|
||||||
# 调试break
|
# 调试break
|
||||||
@ -62,36 +66,13 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
# 获取演员列表
|
# 获取演员列表
|
||||||
def fetch_actor_list():
|
def fetch_actor_list():
|
||||||
#for lang in ["en", "ja", "zh"]:
|
#for lang in ["en", "ja", "zh"]:
|
||||||
for lang in ['ja']:
|
for lang in ['en']:
|
||||||
fetch_actor_list_lang(lang=lang, uncensored=1)
|
fetch_actor_list_lang(lang=lang, uncensored=1)
|
||||||
|
|
||||||
#for lang in ["en", "ja", "zh"]:
|
#for lang in ["en", "ja", "zh"]:
|
||||||
for lang in ['ja']:
|
for lang in ['en']:
|
||||||
fetch_actor_list_lang(lang=lang)
|
fetch_actor_list_lang(lang=lang)
|
||||||
|
|
||||||
|
|
||||||
# 获取演员列表
|
|
||||||
def fetch_actor_list2():
|
|
||||||
next_url = scraper.actors_uncensored_base_url
|
|
||||||
while next_url:
|
|
||||||
logging.info(f'fetching page {next_url}')
|
|
||||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
|
|
||||||
if soup:
|
|
||||||
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
|
|
||||||
if list_data :
|
|
||||||
# 写入数据库
|
|
||||||
for row in list_data:
|
|
||||||
actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
|
|
||||||
if actor_id:
|
|
||||||
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}')
|
|
||||||
else:
|
|
||||||
logging.warning(f'insert performer index failed. name: {row['name']}, href:{row['href']}')
|
|
||||||
else:
|
|
||||||
logging.warning(f'fetch actor error. {next_url} ...')
|
|
||||||
elif status_code and status_code == 404:
|
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
|
||||||
break
|
|
||||||
|
|
||||||
# 获取makers列表
|
# 获取makers列表
|
||||||
def fetch_makers_list():
|
def fetch_makers_list():
|
||||||
next_url = scraper.makers_uncensored_base_url
|
next_url = scraper.makers_uncensored_base_url
|
||||||
|
|||||||
Reference in New Issue
Block a user