modify scripts
This commit is contained in:
@ -136,6 +136,7 @@ if [ "${PERIOD}" = "--monthly" ]; then
|
||||
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
|
||||
register_spider "javhd" "scrapy crawl javhd -a mod='update' "
|
||||
register_spider "lord" "scrapy crawl lord -a mod='update' "
|
||||
register_spider "javbus" "scrapy crawl javbus -a cmd='actors' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ "
|
||||
fi
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,7 @@ home_dir = os.path.expanduser("~")
|
||||
global_share_data_dir = f'{home_dir}/sharedata'
|
||||
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
|
||||
shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"
|
||||
test_db_path = f"{global_share_data_dir}/sqlite/test.db"
|
||||
|
||||
# 单例元类
|
||||
class SingletonMeta(type):
|
||||
|
||||
@ -35,55 +35,6 @@ class Sis001Item(scrapy.Item):
|
||||
size_gb = scrapy.Field()
|
||||
update_date = scrapy.Field()
|
||||
|
||||
class IAFDPersonItem(scrapy.Item):
|
||||
item_type = comm.ITEM_TYPE_ACTOR_INDEX
|
||||
name = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
from_astro_list = scrapy.Field()
|
||||
from_birth_list = scrapy.Field()
|
||||
from_ethnic_list = scrapy.Field()
|
||||
from_movie_list = scrapy.Field()
|
||||
|
||||
class IAFDMovieItem(scrapy.Item):
|
||||
item_type = comm.ITEM_TYPE_MOVIE_INDEX
|
||||
title = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
release_year = scrapy.Field()
|
||||
from_performer_list = scrapy.Field()
|
||||
from_dist_list = scrapy.Field()
|
||||
from_stu_list = scrapy.Field()
|
||||
|
||||
class IAFDPersonDetailItem(scrapy.Item):
|
||||
item_type = comm.ITEM_TYPE_ACTOR_DETAIL
|
||||
href = scrapy.Field()
|
||||
person = scrapy.Field()
|
||||
gender = scrapy.Field()
|
||||
birthday = scrapy.Field()
|
||||
astrology = scrapy.Field()
|
||||
birthplace = scrapy.Field()
|
||||
years_active = scrapy.Field()
|
||||
ethnicity = scrapy.Field()
|
||||
nationality = scrapy.Field()
|
||||
hair_colors = scrapy.Field()
|
||||
eye_color = scrapy.Field()
|
||||
height = scrapy.Field()
|
||||
weight = scrapy.Field()
|
||||
measurements = scrapy.Field()
|
||||
tattoos = scrapy.Field()
|
||||
piercings = scrapy.Field()
|
||||
movies_cnt = scrapy.Field()
|
||||
vixen_cnt = scrapy.Field()
|
||||
blacked_cnt = scrapy.Field()
|
||||
tushy_cnt = scrapy.Field()
|
||||
x_art_cnt = scrapy.Field()
|
||||
performer_aka = scrapy.Field()
|
||||
|
||||
class IAFDMovieDetailItem(scrapy.Item):
|
||||
item_type = comm.ITEM_TYPE_MOVIE_DETAIL
|
||||
title = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
# 可以根据实际需求添加更多影片详情字段
|
||||
|
||||
class PBoxStuItem(scrapy.Item):
|
||||
item_type = scrapy.Field()
|
||||
label_id = scrapy.Field()
|
||||
@ -228,10 +179,12 @@ class IafdDistributorsItem(scrapy.Item):
|
||||
href = scrapy.Field()
|
||||
parent_id = scrapy.Field()
|
||||
details = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdMetaEthnicItem(scrapy.Item):
|
||||
name = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdMoviesItem(scrapy.Item):
|
||||
title = scrapy.Field()
|
||||
@ -251,21 +204,35 @@ class IafdMoviesItem(scrapy.Item):
|
||||
from_performer_list = scrapy.Field()
|
||||
from_dist_list = scrapy.Field()
|
||||
from_stu_list = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
Directors = scrapy.Field()
|
||||
Distributor = scrapy.Field()
|
||||
DistributorHref = scrapy.Field()
|
||||
Studio = scrapy.Field()
|
||||
StudioHref = scrapy.Field()
|
||||
Director = scrapy.Field()
|
||||
DirectorHref = scrapy.Field()
|
||||
Performers = scrapy.Field()
|
||||
SceneBreakdowns = scrapy.Field()
|
||||
AppearsIn = scrapy.Field()
|
||||
|
||||
class IafdMoviesAppersInItem(scrapy.Item):
|
||||
movie_id = scrapy.Field()
|
||||
appears_in_id = scrapy.Field()
|
||||
gradation = scrapy.Field()
|
||||
notes = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdPerformerAliasesItem(scrapy.Item):
|
||||
performer_id = scrapy.Field()
|
||||
alias = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdPerformerUrlsItem(scrapy.Item):
|
||||
performer_id = scrapy.Field()
|
||||
position = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdPerformersItem(scrapy.Item):
|
||||
name = scrapy.Field()
|
||||
@ -299,18 +266,23 @@ class IafdPerformersItem(scrapy.Item):
|
||||
from_birth_list = scrapy.Field()
|
||||
from_ethnic_list = scrapy.Field()
|
||||
from_movie_list = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
credits = scrapy.Field()
|
||||
performer_aka = scrapy.Field()
|
||||
|
||||
class IafdPerformersMoviesItem(scrapy.Item):
|
||||
performer_id = scrapy.Field()
|
||||
movie_id = scrapy.Field()
|
||||
role = scrapy.Field()
|
||||
notes = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdStudiosItem(scrapy.Item):
|
||||
name = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
parent_id = scrapy.Field()
|
||||
details = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class IafdTaskLogItem(scrapy.Item):
|
||||
task_id = scrapy.Field()
|
||||
@ -321,6 +293,7 @@ class IafdTaskLogItem(scrapy.Item):
|
||||
total_distributors = scrapy.Field()
|
||||
total_studios = scrapy.Field()
|
||||
task_status = scrapy.Field()
|
||||
# 以下为添加字段
|
||||
|
||||
class JavbusActorsItem(scrapy.Item):
|
||||
ja_name = scrapy.Field()
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
# return item
|
||||
import json
|
||||
import scrapy
|
||||
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem, PBoxStuItem
|
||||
from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
|
||||
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
|
||||
|
||||
class SQLitePipeline():
|
||||
|
||||
@ -6,6 +6,16 @@ from twisted.internet import reactor, defer, asyncioreactor
|
||||
import time
|
||||
|
||||
class BaseSpider(scrapy.Spider):
|
||||
def __init__(self, *args, **kwargs):
    """Base spider: tracks URLs already requested in this run for de-duplication."""
    # Fix: the original never called scrapy.Spider.__init__, so the base-class
    # setup (spider name, -a command-line kwargs bound as attributes) was skipped.
    super().__init__(*args, **kwargs)
    # URLs already requested during this crawl (see _can_request).
    self.requested_url = set()
|
||||
|
||||
# 记录本次任务已经发起的请求链接
|
||||
def _can_request(self, href):
|
||||
if href in self.requested_url:
|
||||
return False
|
||||
self.requested_url.add(href)
|
||||
return True
|
||||
|
||||
def start_requests(self):
|
||||
"""统一处理请求生成,兼容不同入口点"""
|
||||
# 如果定义了async start方法,使用它
|
||||
|
||||
@ -3,11 +3,11 @@ import re
|
||||
import sys
|
||||
from urllib.parse import urljoin, quote_plus
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider
|
||||
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
||||
from scrapy_proj.items import IafdDistributorsItem, IafdMetaEthnicItem, IafdMoviesItem, IafdPerformersItem, IafdStudiosItem
|
||||
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
|
||||
from scrapy_proj.spiders.parser.iafd_parser import common_parser
|
||||
from scrapy_proj.utils.utils import pretty_json_simple
|
||||
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
|
||||
|
||||
db_tools = IAFDDBHandler()
|
||||
|
||||
@ -40,8 +40,19 @@ class IAFDSpider(BaseSpider):
|
||||
if cmd and cmd != '':
|
||||
self.cmd_list = cmd.split(',')
|
||||
|
||||
self.existed_actors = {}
|
||||
self.existed_movies = {}
|
||||
self.load_existed_actors()
|
||||
self.load_existed_movies()
|
||||
|
||||
# 入口函数,由基类的方法触发
|
||||
def custom_start_requests(self):
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_404", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_404", 0)
|
||||
# 根据命令字执行
|
||||
if self.cmd_astro in self.cmd_list:
|
||||
# 关键:迭代 start_astro 产生的生成器,转发其中的 Request
|
||||
@ -117,59 +128,65 @@ class IAFDSpider(BaseSpider):
|
||||
async for request in super().start():
|
||||
yield request
|
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_astro_page(self, response):
    """Parse one astrology listing page and fan out performer-detail requests."""
    astro = response.meta.get('astro', '')
    data, next_url = common_parser(html=response.text, page='astro', astro=astro)
    if data:
        # Log only the count — logging the full payload (as an older revision did)
        # flooded the debug log.
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])
    else:
        self.logger.warning(f"parse data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDPersonDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_birth_page(self, response):
    """Parse one birthday listing page and fan out performer-detail requests."""
    month = response.meta['month']
    day = response.meta['day']
    data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
    if data:
        # Log only the count — logging the full payload (as an older revision did)
        # flooded the debug log.
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])
    else:
        self.logger.warning(f"parse data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDPersonDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_ethnic_list_page(self, response):
    """Parse the ethnicity <select> index page.

    Emits one IafdMetaEthnicItem and one listing-page request per valid
    ethnicity option.
    """
    div_root = response.css('select#ethnicity1')
    if div_root:
        options = div_root.css('option')
        self.crawler.stats.set_value(f"{self.name}/ethnic_all", len(options))
        self.crawler.stats.set_value(f"{self.name}/ethnic_done", 0)
        for option in options:
            href = option.attrib.get('value')
            text = option.css('::text').get().strip()
            if href and href.lower() != 'none':
                ethnic_url = urljoin(response.url , href)
                self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                item = IafdMetaEthnicItem()
                item['name'] = text
                item['href'] = ethnic_url
                yield item

                # Fix: the original yielded this request twice per option (once
                # before and once after the item); keep a single request.
                yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
    else:
        self.logger.warning(f"parse page error. url: {response.url}")
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_ethnic_page(self, response):
    """Parse one ethnicity listing page; fan out performer requests and follow pagination."""
    ethnic = response.meta['ethnic']
    data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
        for item in data:
            yield from self._create_performer_request(href=item['href'], name=item['person'])

        if next_url:
            # Fix: the original passed meta={'ethnic': text}; `text` is not
            # defined in this method (NameError) — propagate `ethnic` instead.
            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
        else:
            self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
    else:
        self.logger.warning(f"parse data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDPersonDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
||||
|
||||
def parse_distributors_list_page(self, response):
|
||||
select_element = response.css('select[name="Distrib"]')
|
||||
if select_element:
|
||||
@ -178,7 +195,15 @@ class IAFDSpider(BaseSpider):
|
||||
value = option.attrib.get('value')
|
||||
text = option.css('::text').get().strip()
|
||||
dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
|
||||
item = IafdDistributorsItem()
|
||||
item['name'] = text
|
||||
item['href'] = dis_url
|
||||
|
||||
yield item
|
||||
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
|
||||
def parse_studios_list_page(self, response):
|
||||
select_element = response.css('select[name="Studio"]')
|
||||
@ -188,53 +213,156 @@ class IAFDSpider(BaseSpider):
|
||||
value = option.attrib.get('value')
|
||||
text = option.css('::text').get().strip()
|
||||
dis_url = f"{self.host_url}/studio.rme/studio={value}"
|
||||
item = IafdStudiosItem()
|
||||
item['name'] = text
|
||||
item['href'] = dis_url
|
||||
yield item
|
||||
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
|
||||
def parse_stu_dist_page(self, response):
    """Parse a studio/distributor movie listing and fan out movie-detail requests."""
    list_type = response.meta.get('list_type', '')
    data, next_url = common_parser(html=response.text, page=list_type)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        for movie in data:
            yield from self._create_movie_request(href=movie['href'], title=movie['title'])
    else:
        self.logger.warning(f"fetched data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDPersonDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
    # NOTE(review): `next_url` is parsed but never followed here — confirm
    # whether studio/distributor listings paginate.
|
||||
# Helper: issue one performer-detail request per href (deduplicated per crawl).
def _create_performer_request(self, href, name):
    """Yield a request for a performer detail page, at most once per crawl.

    :param href: performer detail URL (skipped if empty/invalid or already requested)
    :param name: performer display name, passed through request meta
    """
    if href != '' and is_valid_url(href):
        if self._can_request(href):
            self.crawler.stats.inc_value(f"{self.name}/actor_all")
            # NOTE(review): item_type is 'movie' even though this targets a
            # performer page — confirm whether block-check logging relies on it.
            yield scrapy.Request(href,
                                 callback=self.parse_person_detail_page,
                                 meta={'name': name, 'item_type': 'movie'}
                                 )
    else:
        # Fix: the original logged f"wrong url. {url}" — `url` is undefined
        # here and raised NameError on any bad href.
        self.logger.warning(f"wrong url. {href}, ignore...")
|
||||
|
||||
# Helper: issue one movie-detail request per href (skips known-complete movies,
# deduplicated per crawl).
def _create_movie_request(self, href, title):
    """Yield a request for a movie detail page, at most once per crawl.

    :param href: movie detail URL (skipped if empty/invalid, already complete,
                 or already requested)
    :param title: movie title, passed through request meta
    """
    if href != '' and is_valid_url(href):
        if self.need_update_movie(href) and self._can_request(href):
            self.crawler.stats.inc_value(f"{self.name}/movie_all")
            # Fix: the original passed cache=True to scrapy.Request; stock
            # scrapy.Request accepts no such kwarg (TypeError). HTTP caching is
            # configured via the HTTPCACHE_* settings / httpcache middleware,
            # so the argument is dropped — confirm no custom Request subclass
            # relied on it.
            yield scrapy.Request(href,
                                 callback=self.parse_movie_detail_page,
                                 meta={'title': title, 'item_type': 'movie'}
                                 )
    else:
        # Fix: the original logged f"wrong url. {url}" — `url` is undefined
        # here and raised NameError on any bad href.
        self.logger.warning(f"wrong url. {href}, ignore...")
|
||||
|
||||
# Parse and handle a performer detail page.
def parse_person_detail_page(self, response):
    """Extract an IafdPerformersItem from a performer page and follow their filmography."""
    data = common_parser(html=response.text, page='actor', url=response.url)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        self.crawler.stats.inc_value(f"{self.name}/actor_done")
        item = IafdPerformersItem()
        for k, v in data.items():
            if k in item.fields:
                item[k] = v

        yield item

        # Follow every movie in the performer's credits.
        # Fix: the original looped `for item in movies` (shadowing the item
        # just yielded) but then read an undefined `movie` variable, raising
        # NameError on the first non-empty credit list.
        for role, movies in data.get('credits', {}).items():
            if movies:
                for movie in movies:
                    yield from self._create_movie_request(href=movie['href'], title=movie['title'])
    else:
        self.logger.warning(f"fetched data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDPersonDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
|
||||
|
||||
# Parse and handle a movie detail page.
def parse_movie_detail_page(self, response):
    """Extract an IafdMoviesItem from a movie page and follow its related links
    (distributor/studio listings, director detail pages)."""
    title = response.meta.get('title', '')
    data = common_parser(html=response.text, page='movies', href=response.url, title=title)
    if data:
        self.logger.debug(f"fetched data from {response.url}, data: {data}")
        self.crawler.stats.inc_value(f"{self.name}/movie_done")
        item = IafdMoviesItem()
        for k, v in data.items():
            if k in item.fields:
                item[k] = v
        yield item

        # Follow the distributor/studio listing links (deduplicated per crawl).
        link_url = data.get('DistributorHref', '')
        if is_valid_url(link_url) and self._can_request(link_url):
            yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})

        link_url = data.get('StudioHref', '')
        if is_valid_url(link_url) and self._can_request(link_url):
            yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})

        # Directors have performer pages on IAFD — reuse the performer helper,
        # which itself validates/deduplicates the href.
        link_url = data.get('DirectorHref', '')
        yield from self._create_performer_request(href=link_url, name=data.get('Director'))

        for director in data.get('Directors', []):
            yield from self._create_performer_request(href=director['href'], name=director['name'])

    else:
        self.logger.warning(f"fetched data error. {response.url}")
    # Fix: removed the dead trailing `item = IAFDMovieDetailItem()` — that class
    # is no longer imported by this module, so the line raised NameError.
|
||||
|
||||
def custom_block_check(self, response):
    """Return a block-reason string when the body reports an invalid/outdated
    page, otherwise None."""
    marker = "invalid or outdated page"
    kind = response.meta.get('item_type', '')
    if marker not in response.text.lower():
        self.logger.info(f"right content. url: {response.url}")
        return None
    self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {kind}")
    return marker
|
||||
# 统一判断并处理异常
|
||||
def _handle_invalid_response(self, response):
|
||||
if response.status in [200]:
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
|
||||
# TODO: 更新404的演员或者影片
|
||||
else:
|
||||
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
|
||||
|
||||
# Handle blocked/error pages (mainly 404 and 403).
def handle_blocked(self, response, reason):
    """Log a blocked or missing page.

    :param response: the failing response (status inspected; meta['item_type'] logged)
    :param reason: block reason supplied by the caller (currently only logged implicitly)
    """
    item_type = response.meta.get('item_type', '')
    if response.status in [404, 403]:
        # Fix: the original had a second `elif` guarding the identical
        # `response.status in [404, 403]` condition — that branch was
        # unreachable; the two branches are merged here.
        self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
        # TODO: mark the corresponding 404'd actor/movie in the DB
    else:
        self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
|
||||
|
||||
|
||||
def load_existed_actors(self):
    """Populate self.existed_actors from the DB: href -> {is_full_data, movies_cnt}."""
    query_args = {}
    rows = db_tools.query_performer_hrefs(**query_args)
    if not rows:
        self.logger.warning(f"query_performer_hrefs empty. query args: {query_args}")
        return
    for row in rows:
        self.existed_actors[row['href']] = {
            'is_full_data': row['is_full_data'],
            'movies_cnt': row['movies_cnt'],
        }
|
||||
|
||||
|
||||
def load_existed_movies(self):
    """Populate self.existed_movies from the DB: href -> is_full_data flag."""
    query_args = {}
    rows = db_tools.query_movie_hrefs(**query_args)
    if not rows:
        self.logger.warning(f"query_movies empty. query args: {query_args}")
        return
    for row in rows:
        self.existed_movies[row['href']] = row['is_full_data']
|
||||
|
||||
# In-memory cache lookup (could be replaced by a DB query).
def need_update_movie(self, href):
    """True unless the movie is already cached with full data (is_full_data > 0)."""
    return self.existed_movies.get(href, 0) <= 0
|
||||
|
||||
# In-memory cache lookup (could be replaced by a DB query).
def need_update_actor(self, href, movies_cnt):
    """True when the actor is unknown, marked incomplete, or has fewer movies cached
    than the count just observed."""
    cached = self.existed_actors.get(href)
    if cached is None:
        return True
    return cached['is_full_data'] <= 0 or cached['movies_cnt'] < movies_cnt
|
||||
|
||||
def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
    """Record an actor in the in-memory cache as fetched."""
    entry = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}
    self.existed_actors[href] = entry
|
||||
|
||||
def acc_movie_to_existed(self, href, is_full_data=1):
    """Record a movie in the in-memory cache as fetched."""
    # NOTE(review): "acc" looks like a typo for "add" — name kept for caller compatibility.
    self.existed_movies.update({href: is_full_data})
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
@ -11,6 +10,7 @@ import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
from datetime import datetime
|
||||
#import config
|
||||
#import utils
|
||||
|
||||
@ -410,8 +410,8 @@ def parse_page_performer(soup, url):
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'height_str': 'Height',
|
||||
'weight_str': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
@ -474,6 +474,20 @@ def parse_page_performer(soup, url):
|
||||
return data
|
||||
|
||||
|
||||
def extract_year_from_date_string(date_str):
    """Extract the year from a date string like "Apr 23, 2021".

    :param date_str: date string in "%b %d, %Y" format (e.g. "Apr 23, 2021")
    :return: the year as an int, or 0 when the input is missing or malformed
             (fix: the original docstring claimed None, but the code returns 0)
    """
    try:
        return datetime.strptime(date_str, "%b %d, %Y").year
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: date_str is None / not a str.
        # (Merged the two identical except clauses of the original.)
        return 0
|
||||
|
||||
# 解析网页 HTML 并提取电影信息
|
||||
def parse_page_movie(soup, href, title):
|
||||
@ -595,19 +609,20 @@ def parse_page_movie(soup, href, title):
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"minutes": movie_data.get("Minutes", ""),
|
||||
"release_date": movie_data.get("Release Date", ""),
|
||||
"added_to_IAFD_date": movie_data.get("Date Added to IAFD", ""),
|
||||
"all_girl": movie_data.get("All-Girl", ""),
|
||||
"all_male": movie_data.get("All-Male", ""),
|
||||
"compilation": movie_data.get("Compilation", ""),
|
||||
"webscene": movie_data.get("Webscene", ""),
|
||||
'release_year': extract_year_from_date_string(movie_data.get("Release Date", "")),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"Directors": movie_data.get("Directors", []), # 可能存在的元素
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
|
||||
Reference in New Issue
Block a user