modify scripts

This commit is contained in:
oscarz
2025-07-03 11:41:30 +08:00
parent 7a46b1bc4c
commit ff49046212
7 changed files with 608 additions and 128 deletions

View File

@ -0,0 +1,104 @@
import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path
class IAFDQuery(SQLiteDBHandler):
def __init__(self, db_path=shared_db_path):
super().__init__(db_path)
self.tbl_name_performers = 'iafd_performers'
self.tbl_name_movies = 'iafd_movies'
self.uniq_key = 'href'
    # Query performer rows matching the given filter conditions
def get_performers(self, **filters):
try:
sql = f"SELECT href, name, id, movies_cnt FROM {self.tbl_name_performers} WHERE 1=1"
params = []
conditions = {
"id": " AND id = ?",
"href": " AND href = ?",
"name": " AND name LIKE ?",
"is_full_data": " AND is_full_data = ?",
"start_id": " AND id > ?",
}
for key, condition in conditions.items():
if key in filters:
sql += condition
if key == "name":
params.append(f"%{filters[key]}%")
else:
params.append(filters[key])
for key in ["is_full_data_in", "is_full_data_not_in"]:
if key in filters:
values = filters[key]
if values:
placeholders = ", ".join(["?"] * len(values))
operator = "IN" if key == "is_full_data_in" else "NOT IN"
sql += f" AND is_full_data {operator} ({placeholders})"
params.extend(values)
if "order_by" in filters:
                # Note: the column name must be interpolated directly after ORDER BY; a "?" placeholder would be bound as a string literal and have no effect on ordering
sql += f" ORDER BY {filters['order_by']} "
if 'limit' in filters:
sql += " LIMIT ?"
params.append(filters["limit"])
self.cursor.execute(sql, params)
return [dict(row) for row in self.cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
    # Query movie rows matching the given filter conditions
def get_movies(self, **filters):
try:
sql = f"SELECT href, title, id FROM {self.tbl_name_performers} WHERE 1=1"
params = []
conditions = {
"id": " AND id = ?",
"href": " AND href = ?",
"title": " AND title LIKE ?",
"is_full_data": " AND is_full_data = ?",
"start_id": " AND id > ?",
}
for key, condition in conditions.items():
if key in filters:
sql += condition
if key == "name":
params.append(f"%{filters[key]}%")
else:
params.append(filters[key])
for key in ["is_full_data_in", "is_full_data_not_in"]:
if key in filters:
values = filters[key]
if values:
placeholders = ", ".join(["?"] * len(values))
operator = "IN" if key == "is_full_data_in" else "NOT IN"
sql += f" AND is_full_data {operator} ({placeholders})"
params.extend(values)
if "order_by" in filters:
                # Note: the column name must be interpolated directly after ORDER BY; a "?" placeholder would be bound as a string literal and have no effect on ordering
sql += f" ORDER BY {filters['order_by']} "
if 'limit' in filters:
sql += " LIMIT ?"
params.append(filters["limit"])
self.cursor.execute(sql, params)
return [dict(row) for row in self.cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
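For reference, a minimal usage sketch of the filter interface above; the flag values and limits are illustrative, and the results depend on what has already been written to shared.db:

from scrapy_proj.db_wapper.iafd_query import IAFDQuery

db = IAFDQuery()
# Performers that still need a detail crawl, ordered by id, at most 10 rows
pending_performers = db.get_performers(is_full_data=0, order_by="id", limit=10)
# Movies whose is_full_data flag is 0 or 2
pending_movies = db.get_movies(is_full_data_in=[0, 2], limit=10)
db.close()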

View File

@ -0,0 +1,134 @@
import os
import sqlite3
import logging
from datetime import datetime
home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"
# Base class for database access; wraps the common SQLite operations.
class SQLiteDBHandler:
def __init__(self, db_path=None):
        # Use the provided db_path, or fall back to the default path
self.DB_PATH = db_path or default_dbpath
        # Make sure the parent directory exists, creating it if needed
if db_path and not os.path.exists(os.path.dirname(db_path)):
os.makedirs(os.path.dirname(db_path))
self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.conn.execute('PRAGMA journal_mode = WAL')  # Enable WAL (Write-Ahead Logging) mode
self.conn.commit()
        self.conn.row_factory = sqlite3.Row  # Rows support dict-style access
self.cursor = self.conn.cursor()
        # Check the SQLite version; ON CONFLICT upserts need 3.24.0 or newer
self.lower_sqlite_version = False
sqlite_version = sqlite3.sqlite_version_info
if sqlite_version < (3, 24, 0):
self.lower_sqlite_version = True
def get_table_columns_and_defaults(self, tbl_name):
try:
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = self.cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
return None
def check_and_process_data(self, data, tbl_name):
column_info = self.get_table_columns_and_defaults(tbl_name)
if column_info is None:
return None
processed_data = {}
for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # Auto-increment primary key and creation date use the table defaults
continue
            if col == 'updated_at':  # Refresh the update timestamp on every write
processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if col in data:
processed_data[col] = data[col]
return processed_data
def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
if self.lower_sqlite_version:
return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
'''
self.cursor.execute(sql, values)
self.conn.commit()
            # Fetch the ID of the inserted or updated record
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
            # Try a plain INSERT first
try:
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
'''
self.cursor.execute(sql, values)
self.conn.commit()
            except sqlite3.IntegrityError:  # Unique-key conflict: fall back to an UPDATE
update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
update_values.append(data[uniq_key])
sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
self.cursor.execute(sql, update_values)
self.conn.commit()
            # Fetch the ID of the inserted or updated record
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
return row[0] if row else None
def close(self):
self.cursor.close()
self.conn.close()
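A hedged sketch of how the upsert entry point is meant to be called; the pages table, its columns, and the UNIQUE constraint on url are illustrative assumptions, not part of this commit:

from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler

db = SQLiteDBHandler()
# Hypothetical table `pages` with a UNIQUE constraint on `url`: insert once, update on conflict
row_id = db.insert_or_update_common(
    {"url": "https://example.com/page/1", "title": "Example"},
    tbl_name="pages",
    uniq_key="url",
)
db.close()

On SQLite 3.24+ this takes the ON CONFLICT ... DO UPDATE path; older versions fall back to the INSERT-then-UPDATE variant.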

View File

@ -20,3 +20,48 @@ class Sis001Item(scrapy.Item):
title = scrapy.Field()
url = scrapy.Field()
plate_name = scrapy.Field()
class IAFDPersonItem(scrapy.Item):
name = scrapy.Field()
href = scrapy.Field()
from_astro_list = scrapy.Field()
from_birth_list = scrapy.Field()
from_ethnic_list = scrapy.Field()
from_movie_list = scrapy.Field()
class IAFDMovieItem(scrapy.Item):
title = scrapy.Field()
href = scrapy.Field()
release_year = scrapy.Field()
from_performer_list = scrapy.Field()
from_dist_list = scrapy.Field()
from_stu_list = scrapy.Field()
class IAFDPersonDetailItem(scrapy.Item):
href = scrapy.Field()
person = scrapy.Field()
gender = scrapy.Field()
birthday = scrapy.Field()
astrology = scrapy.Field()
birthplace = scrapy.Field()
years_active = scrapy.Field()
ethnicity = scrapy.Field()
nationality = scrapy.Field()
hair_colors = scrapy.Field()
eye_color = scrapy.Field()
height = scrapy.Field()
weight = scrapy.Field()
measurements = scrapy.Field()
tattoos = scrapy.Field()
piercings = scrapy.Field()
movies_cnt = scrapy.Field()
vixen_cnt = scrapy.Field()
blacked_cnt = scrapy.Field()
tushy_cnt = scrapy.Field()
x_art_cnt = scrapy.Field()
performer_aka = scrapy.Field()
class IAFDMovieDetailItem(scrapy.Item):
title = scrapy.Field()
href = scrapy.Field()
    # More movie-detail fields can be added here as needed

View File

@ -98,3 +98,68 @@ class ScrapyProjDownloaderMiddleware:
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
import cloudscraper
from scrapy.http import TextResponse
import datetime
# Downloader middleware that fetches pages through cloudscraper instead of Scrapy's default downloader
class CloudScraperMiddleware:
def __init__(self, stats):
self.scraper = cloudscraper.create_scraper()
        self.stats = stats  # Injected Scrapy stats collector
        # Domains that should be fetched through cloudscraper
self.target_domains = {'iafd.com', 'another-domain.com'}
        # Default request headers for the scraper
self.ifad_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
@classmethod
def from_crawler(cls, crawler):
return cls(
            stats=crawler.stats  # Scrapy stats collector
)
    def process_request(self, request, spider):
        # Only intercept requests for the configured target domains; everything else falls through to the default downloader
        if not any(domain in request.url for domain in self.target_domains):
            return None
        # Record the request start time
        start_time = datetime.datetime.now()
try:
            # Send the request through cloudscraper
response = self.scraper.get(
request.url,
headers=self.ifad_headers,
cookies=request.cookies
)
            # Request duration in milliseconds
duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
            # Update downloader stats
self.stats.inc_value('downloader/request_count')
self.stats.inc_value('downloader/request_method_count/GET')
self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
self.stats.inc_value('downloader/response_count')
self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
self.stats.inc_value('downloader/response_bytes', len(response.content))
            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
            # Wrap the result in a Scrapy response object
return TextResponse(
url=response.url,
status=response.status_code,
body=response.content,
encoding=response.encoding,
request=request
)
        except Exception as e:
            # Record the error
            self.stats.inc_value('downloader/exception_count')
            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
            spider.logger.error(f"CloudScraper request failed: {e}")
            return None  # Fall back to the default downloader on failure
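For context, the call the middleware wraps, shown as a standalone sketch (the URL and timeout are illustrative): cloudscraper exposes a requests-compatible session that handles anti-bot challenges before returning the page.

import cloudscraper

scraper = cloudscraper.create_scraper()  # requests.Session subclass
resp = scraper.get("https://www.iafd.com/", timeout=30)
print(resp.status_code, len(resp.content))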

View File

@ -15,132 +15,8 @@ import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.items import U001Item, Sis001Item
home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
# Base class for database access; wraps the common SQLite operations.
class SQLiteDBHandler:
def __init__(self, db_path=None):
        # Use the provided db_path, or fall back to the default path
self.DB_PATH = db_path or default_dbpath
        # Make sure the parent directory exists, creating it if needed
if db_path and not os.path.exists(os.path.dirname(db_path)):
os.makedirs(os.path.dirname(db_path))
self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
self.cursor = self.conn.cursor()
        # Check the SQLite version; ON CONFLICT upserts need 3.24.0 or newer
self.lower_sqlite_version = False
sqlite_version = sqlite3.sqlite_version_info
if sqlite_version < (3, 24, 0):
self.lower_sqlite_version = True
def get_table_columns_and_defaults(self, tbl_name):
try:
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = self.cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
return None
def check_and_process_data(self, data, tbl_name):
column_info = self.get_table_columns_and_defaults(tbl_name)
if column_info is None:
return None
processed_data = {}
for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # Auto-increment primary key and creation date use the table defaults
continue
            if col == 'updated_at':  # Refresh the update timestamp on every write
processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if col in data:
processed_data[col] = data[col]
return processed_data
def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
if self.lower_sqlite_version:
return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
'''
self.cursor.execute(sql, values)
self.conn.commit()
            # Fetch the ID of the inserted or updated record
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
            # Try a plain INSERT first
try:
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
'''
self.cursor.execute(sql, values)
self.conn.commit()
            except sqlite3.IntegrityError:  # Unique-key conflict: fall back to an UPDATE
update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
update_values.append(data[uniq_key])
sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
self.cursor.execute(sql, update_values)
self.conn.commit()
            # Fetch the ID of the inserted or updated record
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
return row[0] if row else None
def close(self):
self.cursor.close()
self.conn.close()
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
class SQLitePipeline(SQLiteDBHandler):
def __init__(self, db_path=None):
@ -188,6 +64,14 @@ class SQLitePipeline(SQLiteDBHandler):
self._process_u001_item(item)
elif isinstance(item, Sis001Item):
self._process_sis001_item(item)
elif isinstance(item, IAFDPersonItem):
self._process_iafd_person_item(item)
elif isinstance(item, IAFDPersonDetailItem):
self._process_iafd_person_detail_item(item)
elif isinstance(item, IAFDMovieItem):
self._process_iafd_movie_item(item)
elif isinstance(item, IAFDMovieDetailItem):
self._process_iafd_movie_detail_item(item)
return item
def _process_u001_item(self, item):
@ -205,5 +89,17 @@ class SQLitePipeline(SQLiteDBHandler):
))
self.conn.commit()
def _process_iafd_person_item(self, item):
logging.info(f"deal with persion item. {item}")
def _process_iafd_movie_item(self, item):
logging.info(f"deal with movie item. {item}")
def _process_iafd_person_detail_item(self, item):
logging.info(f"deal with persion item. {item}")
def _process_iafd_movie_detail_item(self, item):
logging.info(f"deal with movie item. {item}")
def close_spider(self, spider):
self.conn.close()
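The four _process_iafd_* handlers above are logging stubs for now. A hedged sketch of one way they could later persist items through the inherited upsert helper, assuming the iafd_performers and iafd_movies tables exist with columns matching the item fields:

from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler

class IAFDPersistSketch(SQLiteDBHandler):
    """Illustrative only; mirrors what the stubs might eventually do."""
    def process_iafd_person_item(self, item):
        # Upsert keyed on href; assumes iafd_performers has matching columns
        return self.insert_or_update_common(dict(item), tbl_name='iafd_performers', uniq_key='href')
    def process_iafd_movie_item(self, item):
        # Upsert keyed on href; assumes iafd_movies has matching columns
        return self.insert_or_update_common(dict(item), tbl_name='iafd_movies', uniq_key='href')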

View File

@ -30,6 +30,7 @@ ADDONS = {}
# Concurrency settings
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_ITEMS = 100
# Download delay
@ -51,6 +52,7 @@ USER_AGENT_LIST = [
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
}
# settings.py
@ -66,7 +68,7 @@ STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensio
#USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
#ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

View File

@ -0,0 +1,234 @@
import scrapy
import re
import logging
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
db_tools = IAFDQuery()
class IAFDSpider(scrapy.Spider):
name = "iafd"
allowed_domains = ["iafd.com"]
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
studios_list_url = f"{host_url}/studio.asp"
ethnic_list_url = f'{host_url}/advsearch.asp'
def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
self.cmd_list = cmd
self.update = int(update)
def start_requests(self):
        # Fetch performer lists by astrological sign
for astro in self.astro_list:
url = self.astr_base_url + astro
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
if self.debug:
break
        # Fetch performer lists by birthday
for month in range(1, 13):
for day in range(1, 32):
url = self.birth_base_url.format(month=month, day=day)
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
if self.debug:
break
        # Fetch the ethnicity list
yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
        # Fetch the distributors list
yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
        # Fetch the studios list
yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
query_args = {}
if self.debug:
query_args['limit'] = 5
if self.update == 0:
query_args['is_full_data'] = 0
        # Read the performers pending an update from the database
actors = db_tools.get_performers(**query_args)
if actors:
for item in actors:
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
logging.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
        # Read the movies pending an update from the database
movies = db_tools.get_movies(**query_args)
if movies:
for item in movies:
href = item.get('href', '')
logging.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
async def start(self):
        # Delegate to the original start_requests via the base implementation
async for request in super().start():
yield request
def parse_astro_page(self, response):
astro = response.meta['astro']
astro_div = response.css('div#astro')
if astro_div:
birth_date = None
for elem in astro_div.css('*'):
if elem.css('h3.astroday'):
birth_date = elem.css('h3.astroday::text').get().strip()
elif elem.css('div.perficon'):
a_tag = elem.css('a')
if a_tag:
href = self.host_url + a_tag.attrib['href']
name = a_tag.css('span.perfname::text').get()
if name:
item = IAFDPersonItem()
item['name'] = name
item['href'] = href
item['from_astro_list'] = 1
item['from_birth_list'] = 0
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
def parse_birth_page(self, response):
month = response.meta['month']
day = response.meta['day']
datarows = response.css('div.col-sm-12.col-lg-9')
if datarows:
rows = datarows[0].css('div.col-sm-4')
for row in rows:
link_tag = row.css('a')
person = link_tag.css('::text').get().strip() if link_tag else ''
href = self.host_url + link_tag.attrib['href'] if link_tag else ''
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 1
item['from_ethnic_list'] = 0
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
def parse_ethnic_list_page(self, response):
div_root = response.css('select#ethnicity1')
if div_root:
options = div_root.css('option')
for option in options:
href = option.attrib.get('value')
text = option.css('::text').get().strip()
if href and href.lower() != 'none':
ethnic_url = self.host_url + href
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
if self.debug:
break
def parse_ethnic_page(self, response):
ethnic = response.meta['ethnic']
rows = response.css('div.row.headshotrow')
for row in rows:
cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
for col in cols:
link_tag = col.css('a')
img_tag = col.css('div.pictag')
if link_tag and img_tag:
href = self.host_url + link_tag.attrib['href']
person = img_tag.css('::text').get().strip()
item = IAFDPersonItem()
item['name'] = person
item['href'] = href
item['from_astro_list'] = 0
item['from_birth_list'] = 0
item['from_ethnic_list'] = 1
item['from_movie_list'] = 0
yield item
#yield scrapy.Request(href, callback=self.parse_person_detail_page)
next_page = response.css('a[rel="next"]')
if next_page:
next_url = self.host_url + next_page.attrib['href']
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
if select_element:
options = select_element.css('option')
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
dis_url = self.host_url + f"/distrib.rme/distrib={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = dis_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 1
item['from_stu_list'] = 0
yield item
yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
if select_element:
options = select_element.css('option')
for option in options:
value = option.attrib.get('value')
text = option.css('::text').get().strip()
stu_url = self.host_url + f"/studio.rme/studio={value}"
item = IAFDMovieItem()
item['title'] = text
item['href'] = stu_url
item['release_year'] = 0
item['from_performer_list'] = 0
item['from_dist_list'] = 0
item['from_stu_list'] = 1
yield item
yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
def parse_person_detail_page(self, response):
item = IAFDPersonDetailItem()
item['href'] = response.url
        item['person'] = response.css('h1::text').get()  # Assumes the name is in the h1 tag
        # Parse the remaining details; adjust these selectors to the actual page structure
item['gender'] = response.css('span.gender::text').get()
item['birthday'] = response.css('span.birthday::text').get()
item['astrology'] = response.css('span.astrology::text').get()
item['birthplace'] = response.css('span.birthplace::text').get()
item['years_active'] = response.css('span.years_active::text').get()
item['ethnicity'] = response.css('span.ethnicity::text').get()
item['nationality'] = response.css('span.nationality::text').get()
item['hair_colors'] = response.css('span.hair_colors::text').get()
item['eye_color'] = response.css('span.eye_color::text').get()
item['height'] = response.css('span.height::text').get()
item['weight'] = response.css('span.weight::text').get()
item['measurements'] = response.css('span.measurements::text').get()
item['tattoos'] = response.css('span.tattoos::text').get()
item['piercings'] = response.css('span.piercings::text').get()
item['movies_cnt'] = response.css('span.movies_cnt::text').get()
item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
item['performer_aka'] = response.css('span.performer_aka::text').getall()
yield item
def parse_movie_detail_page(self, response):
item = IAFDMovieDetailItem()
        item['title'] = response.css('h1::text').get()  # Assumes the title is in the h1 tag
item['href'] = response.url
        # Parse additional details; adjust selectors to the actual page structure
yield item
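For reference, a sketch of launching the spider programmatically with the arguments handled in __init__; the import path for IAFDSpider is an assumption about the project layout, and the CLI equivalent would be scrapy crawl iafd -a debug=true -a update=0:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_proj.spiders.iafd import IAFDSpider  # assumed module path

process = CrawlerProcess(get_project_settings())
# debug='true' breaks out of the list loops early; update='0' limits DB reads to rows with is_full_data = 0
process.crawl(IAFDSpider, debug='true', update='0')
process.start()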