modify scripts
scrapy_proj/scrapy_proj/db_wapper/iafd_query.py (new file, 104 lines)
@@ -0,0 +1,104 @@
import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path


class IAFDQuery(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):
        super().__init__(db_path)
        self.tbl_name_performers = 'iafd_performers'
        self.tbl_name_movies = 'iafd_movies'
        self.uniq_key = 'href'

    # Query the performer href list by the given filters
    def get_performers(self, **filters):
        try:
            sql = f"SELECT href, name, id, movies_cnt FROM {self.tbl_name_performers} WHERE 1=1"
            params = []

            conditions = {
                "id": " AND id = ?",
                "href": " AND href = ?",
                "name": " AND name LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "name":
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # Note: ORDER BY must be followed by a column name; a placeholder cannot be used here,
                # otherwise the value would be bound as a string literal
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [dict(row) for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"Failed to query performer hrefs: {e}")
            return None

    # Query the movie href list by the given filters
    def get_movies(self, **filters):
        try:
            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
            params = []

            conditions = {
                "id": " AND id = ?",
                "href": " AND href = ?",
                "title": " AND title LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "title":
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # Note: ORDER BY must be followed by a column name; a placeholder cannot be used here,
                # otherwise the value would be bound as a string literal
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [dict(row) for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"Failed to query movie hrefs: {e}")
            return None
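A brief, hypothetical usage sketch of the query helper above (not part of the commit); it assumes the iafd_performers table already exists with the columns selected in get_performers, and shows how the keyword filters map onto the generated WHERE clause:

# Hypothetical example: fetch up to 10 performers whose detail pages have not
# been crawled yet (is_full_data = 0), ordered by id.
from scrapy_proj.db_wapper.iafd_query import IAFDQuery

db = IAFDQuery()
rows = db.get_performers(is_full_data=0, order_by='id', limit=10)
for row in rows or []:  # rows is None if a sqlite3.Error occurred
    print(row['id'], row['name'], row['href'])
db.close()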
scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py (new file, 134 lines)
@@ -0,0 +1,134 @@
import os
import sqlite3
import logging
from datetime import datetime

home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"


# Database base class that wraps the common operations.
class SQLiteDBHandler:
    def __init__(self, db_path=None):
        # Use the given db_path, or fall back to the default path
        self.DB_PATH = db_path or default_dbpath

        # Make sure the parent directory exists (optional)
        if db_path and not os.path.exists(os.path.dirname(db_path)):
            os.makedirs(os.path.dirname(db_path))

        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.conn.execute('PRAGMA journal_mode = WAL')  # enable WAL (Write-Ahead Logging) mode
        self.conn.commit()

        self.conn.row_factory = sqlite3.Row  # result rows support dict-style access
        self.cursor = self.conn.cursor()

        # Check the SQLite version
        self.lower_sqlite_version = False
        sqlite_version = sqlite3.sqlite_version_info
        if sqlite_version < (3, 24, 0):
            self.lower_sqlite_version = True

    def get_table_columns_and_defaults(self, tbl_name):
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def check_and_process_data(self, data, tbl_name):
        column_info = self.get_table_columns_and_defaults(tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment key and creation date come from the table defaults
                continue
            if col == 'updated_at':  # update timestamp is set explicitly here
                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if col in data:
                processed_data[col] = data[col]

        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
        if self.lower_sqlite_version:
            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)

        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()

            # Fetch the id of the inserted or updated record
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])

            # Try a plain INSERT first
            try:
                sql = f'''
                    INSERT INTO {tbl_name} ({columns})
                    VALUES ({placeholders})
                '''
                self.cursor.execute(sql, values)
                self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an UPDATE
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
                update_values.append(data[uniq_key])
                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
                self.cursor.execute(sql, update_values)
                self.conn.commit()

            # Fetch the id of the inserted or updated record
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def get_id_by_key(self, tbl, uniq_key, val):
        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
        row = self.cursor.fetchone()
        return row[0] if row else None

    def close(self):
        self.cursor.close()
        self.conn.close()
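A hypothetical usage sketch of the upsert helper (not part of the commit); it assumes an iafd_performers table with a UNIQUE constraint on href plus id/created_at/updated_at columns, which is what check_and_process_data and the ON CONFLICT clause rely on:

# Hypothetical example of insert_or_update_common.
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path

db = SQLiteDBHandler(shared_db_path)
record_id = db.insert_or_update_common(
    {'href': 'https://www.iafd.com/person.rme/id=example', 'name': 'Example Name'},  # placeholder values
    tbl_name='iafd_performers',
    uniq_key='href',
)
print(record_id)  # id of the inserted row, or of the existing row that was updated
db.close()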
scrapy_proj/scrapy_proj/items.py
@@ -19,4 +19,49 @@ class U001Item(scrapy.Item):
 class Sis001Item(scrapy.Item):
     title = scrapy.Field()
     url = scrapy.Field()
     plate_name = scrapy.Field()
+
+class IAFDPersonItem(scrapy.Item):
+    name = scrapy.Field()
+    href = scrapy.Field()
+    from_astro_list = scrapy.Field()
+    from_birth_list = scrapy.Field()
+    from_ethnic_list = scrapy.Field()
+    from_movie_list = scrapy.Field()
+
+class IAFDMovieItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    release_year = scrapy.Field()
+    from_performer_list = scrapy.Field()
+    from_dist_list = scrapy.Field()
+    from_stu_list = scrapy.Field()
+
+class IAFDPersonDetailItem(scrapy.Item):
+    href = scrapy.Field()
+    person = scrapy.Field()
+    gender = scrapy.Field()
+    birthday = scrapy.Field()
+    astrology = scrapy.Field()
+    birthplace = scrapy.Field()
+    years_active = scrapy.Field()
+    ethnicity = scrapy.Field()
+    nationality = scrapy.Field()
+    hair_colors = scrapy.Field()
+    eye_color = scrapy.Field()
+    height = scrapy.Field()
+    weight = scrapy.Field()
+    measurements = scrapy.Field()
+    tattoos = scrapy.Field()
+    piercings = scrapy.Field()
+    movies_cnt = scrapy.Field()
+    vixen_cnt = scrapy.Field()
+    blacked_cnt = scrapy.Field()
+    tushy_cnt = scrapy.Field()
+    x_art_cnt = scrapy.Field()
+    performer_aka = scrapy.Field()
+
+class IAFDMovieDetailItem(scrapy.Item):
+    title = scrapy.Field()
+    href = scrapy.Field()
+    # More movie detail fields can be added here as needed
scrapy_proj/scrapy_proj/middlewares.py
@@ -98,3 +98,68 @@ class ScrapyProjDownloaderMiddleware:
     def spider_opened(self, spider):
         spider.logger.info("Spider opened: %s" % spider.name)
 
+
+import cloudscraper
+from scrapy.http import TextResponse
+import datetime
+
+
+# Use cloudscraper as a proxy downloader to request the target site.
+class CloudScraperMiddleware:
+    def __init__(self, stats):
+        self.scraper = cloudscraper.create_scraper()
+        self.stats = stats  # injected stats collector
+        # Domains that should be fetched through cloudscraper
+        self.target_domains = {'iafd.com', 'another-domain.com'}
+
+        # Headers used by the scraper
+        self.ifad_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            stats=crawler.stats  # Scrapy stats collector
+        )
+
+    def process_request(self, request, spider):
+        # Record the request start time
+        start_time = datetime.datetime.now()
+
+        try:
+            # Send the request through cloudscraper
+            response = self.scraper.get(
+                request.url,
+                headers=self.ifad_headers,
+                cookies=request.cookies
+            )
+
+            # Request duration in milliseconds
+            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+
+            # Update crawl statistics
+            self.stats.inc_value('downloader/request_count')
+            self.stats.inc_value('downloader/request_method_count/GET')
+            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
+
+            self.stats.inc_value('downloader/response_count')
+            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
+            self.stats.inc_value('downloader/response_bytes', len(response.content))
+
+            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+
+            # Build a Scrapy response object from the cloudscraper result
+            return TextResponse(
+                url=response.url,
+                status=response.status_code,
+                body=response.content,
+                encoding=response.encoding,
+                request=request
+            )
+
+        except Exception as e:
+            # Record the error
+            self.stats.inc_value('downloader/exception_count')
+            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            spider.logger.error(f"CloudScraper request failed: {e}")
+            return None  # fall back to the default downloader on failure
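The middleware above delegates the actual download to the cloudscraper library; a minimal standalone sketch (not part of the commit, URL used only as a placeholder) of the underlying calls it relies on:

# Reduced to the bare cloudscraper calls used by CloudScraperMiddleware.
import cloudscraper

scraper = cloudscraper.create_scraper()      # requests-like session that solves Cloudflare JS challenges
resp = scraper.get('https://www.iafd.com/')  # returns a regular requests Response
print(resp.status_code, len(resp.content))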
scrapy_proj/scrapy_proj/pipelines.py
@@ -15,132 +15,8 @@ import os
 import sqlite3
 import logging
 from datetime import datetime
-from scrapy_proj.items import U001Item, Sis001Item
+from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
+from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
-home_dir = os.path.expanduser("~")
-global_share_data_dir = f'{home_dir}/sharedata'
-default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
-
-
-# Database base class that wraps the common operations.
-class SQLiteDBHandler:
-    def __init__(self, db_path=None):
-        # Use the given db_path, or fall back to the default path
-        self.DB_PATH = db_path or default_dbpath
-
-        # Make sure the parent directory exists (optional)
-        if db_path and not os.path.exists(os.path.dirname(db_path)):
-            os.makedirs(os.path.dirname(db_path))
-
-        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
-        self.cursor = self.conn.cursor()
-
-        # Check the SQLite version
-        self.lower_sqlite_version = False
-        sqlite_version = sqlite3.sqlite_version_info
-        if sqlite_version < (3, 24, 0):
-            self.lower_sqlite_version = True
-
-    def get_table_columns_and_defaults(self, tbl_name):
-        try:
-            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
-            columns = self.cursor.fetchall()
-            column_info = {}
-            for col in columns:
-                col_name = col[1]
-                default_value = col[4]
-                column_info[col_name] = default_value
-            return column_info
-        except sqlite3.Error as e:
-            logging.error(f"Error getting table columns: {e}")
-            return None
-
-    def check_and_process_data(self, data, tbl_name):
-        column_info = self.get_table_columns_and_defaults(tbl_name)
-        if column_info is None:
-            return None
-        processed_data = {}
-        for col, default in column_info.items():
-            if col == 'id' or col == 'created_at':  # auto-increment key and creation date come from the table defaults
-                continue
-            if col == 'updated_at':  # update timestamp is set explicitly here
-                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            if col in data:
-                processed_data[col] = data[col]
-
-        return processed_data
-
-    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
-        if self.lower_sqlite_version:
-            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
-
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
-
-            sql = f'''
-                INSERT INTO {tbl_name} ({columns})
-                VALUES ({placeholders})
-                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
-            '''
-            self.cursor.execute(sql, values)
-            self.conn.commit()
-
-            # Fetch the id of the inserted or updated record
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
-        try:
-            processed_data = self.check_and_process_data(data, tbl_name)
-            if processed_data is None:
-                return None
-
-            columns = ', '.join(processed_data.keys())
-            values = list(processed_data.values())
-            placeholders = ', '.join(['?' for _ in values])
-
-            # Try a plain INSERT first
-            try:
-                sql = f'''
-                    INSERT INTO {tbl_name} ({columns})
-                    VALUES ({placeholders})
-                '''
-                self.cursor.execute(sql, values)
-                self.conn.commit()
-            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an UPDATE
-                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
-                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
-                update_values.append(data[uniq_key])
-                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
-                self.cursor.execute(sql, update_values)
-                self.conn.commit()
-
-            # Fetch the id of the inserted or updated record
-            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
-            record_id = self.cursor.fetchone()[0]
-            return record_id
-        except sqlite3.Error as e:
-            logging.error(f"Error inserting or updating data: {e}")
-            return None
-
-    def get_id_by_key(self, tbl, uniq_key, val):
-        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
-        row = self.cursor.fetchone()
-        return row[0] if row else None
-
-    def close(self):
-        self.cursor.close()
-        self.conn.close()
 
 class SQLitePipeline(SQLiteDBHandler):
     def __init__(self, db_path=None):
@@ -188,6 +64,14 @@ class SQLitePipeline(SQLiteDBHandler):
             self._process_u001_item(item)
         elif isinstance(item, Sis001Item):
             self._process_sis001_item(item)
+        elif isinstance(item, IAFDPersonItem):
+            self._process_iafd_person_item(item)
+        elif isinstance(item, IAFDPersonDetailItem):
+            self._process_iafd_person_detail_item(item)
+        elif isinstance(item, IAFDMovieItem):
+            self._process_iafd_movie_item(item)
+        elif isinstance(item, IAFDMovieDetailItem):
+            self._process_iafd_movie_detail_item(item)
         return item
 
     def _process_u001_item(self, item):
@@ -205,5 +89,17 @@ class SQLitePipeline(SQLiteDBHandler):
         ))
         self.conn.commit()
 
+    def _process_iafd_person_item(self, item):
+        logging.info(f"deal with person item. {item}")
+
+    def _process_iafd_movie_item(self, item):
+        logging.info(f"deal with movie item. {item}")
+
+    def _process_iafd_person_detail_item(self, item):
+        logging.info(f"deal with person detail item. {item}")
+
+    def _process_iafd_movie_detail_item(self, item):
+        logging.info(f"deal with movie detail item. {item}")
+
     def close_spider(self, spider):
         self.conn.close()
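The four _process_iafd_* handlers added above are placeholders that only log the item. A hypothetical sketch of how _process_iafd_person_item could be wired to the upsert helper inherited from SQLiteDBHandler (it assumes an iafd_performers table whose columns match the IAFDPersonItem fields and that href is the unique key):

# Hypothetical implementation, not part of the commit.
def _process_iafd_person_item(self, item):
    record_id = self.insert_or_update_common(
        dict(item),                    # name, href and the from_*_list flags
        tbl_name='iafd_performers',
        uniq_key='href',
    )
    logging.info(f"deal with person item. id={record_id}")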
scrapy_proj/scrapy_proj/settings.py
@@ -30,6 +30,7 @@ ADDONS = {}
 
 # Concurrency settings
 CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100
 
 # Download delay
@@ -51,6 +52,7 @@ USER_AGENT_LIST = [
 DOWNLOADER_MIDDLEWARES = {
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
     'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
+    'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
 }
 
 # settings.py
@@ -66,7 +68,7 @@ STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensio
 #USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+#ROBOTSTXT_OBEY = True
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
scrapy_proj/scrapy_proj/spiders/iafd_spider.py (new file, 234 lines)
@@ -0,0 +1,234 @@
import scrapy
import re
import logging
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery

db_tools = IAFDQuery()


class IAFDSpider(scrapy.Spider):
    name = "iafd"
    allowed_domains = ["iafd.com"]

    host_url = "https://www.iafd.com"
    astr_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f'{host_url}/distrib.asp'
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f'{host_url}/advsearch.asp'

    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.cmd_list = cmd
        self.update = int(update)

    def start_requests(self):
        # Fetch performer lists by astrological sign
        for astro in self.astro_list:
            url = self.astr_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
            if self.debug:
                break

        # Fetch performer lists by birthday
        for month in range(1, 13):
            for day in range(1, 32):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
                if self.debug:
                    break

        # Fetch the ethnicity list
        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # Fetch the distributors list
        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

        # Fetch the studios list
        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update == 0:
            query_args['is_full_data'] = 0

        # Load the performers that still need updating
        actors = db_tools.get_performers(**query_args)
        if actors:
            for item in actors:
                href = item.get('href', '')
                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
                logging.info(f"fetch from db. item: {item}")
                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})

        # Load the movies that still need updating
        movies = db_tools.get_movies(**query_args)
        if movies:
            for item in movies:
                href = item.get('href', '')
                logging.info(f"fetch from db. item: {item}")
                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})

    async def start(self):
        # Delegate to the original start_requests logic
        async for request in super().start():
            yield request

    def parse_astro_page(self, response):
        astro = response.meta['astro']
        astro_div = response.css('div#astro')
        if astro_div:
            birth_date = None
            for elem in astro_div.css('*'):
                if elem.css('h3.astroday'):
                    birth_date = elem.css('h3.astroday::text').get().strip()
                elif elem.css('div.perficon'):
                    a_tag = elem.css('a')
                    if a_tag:
                        href = self.host_url + a_tag.attrib['href']
                        name = a_tag.css('span.perfname::text').get()
                        if name:
                            item = IAFDPersonItem()
                            item['name'] = name
                            item['href'] = href
                            item['from_astro_list'] = 1
                            item['from_birth_list'] = 0
                            item['from_ethnic_list'] = 0
                            item['from_movie_list'] = 0
                            yield item
                            #yield scrapy.Request(href, callback=self.parse_person_detail_page)

    def parse_birth_page(self, response):
        month = response.meta['month']
        day = response.meta['day']
        datarows = response.css('div.col-sm-12.col-lg-9')
        if datarows:
            rows = datarows[0].css('div.col-sm-4')
            for row in rows:
                link_tag = row.css('a')
                person = link_tag.css('::text').get().strip() if link_tag else ''
                href = (self.host_url + link_tag.attrib['href']) if link_tag else ''

                item = IAFDPersonItem()
                item['name'] = person
                item['href'] = href
                item['from_astro_list'] = 0
                item['from_birth_list'] = 1
                item['from_ethnic_list'] = 0
                item['from_movie_list'] = 0
                yield item
                #yield scrapy.Request(href, callback=self.parse_person_detail_page)

    def parse_ethnic_list_page(self, response):
        div_root = response.css('select#ethnicity1')
        if div_root:
            options = div_root.css('option')
            for option in options:
                href = option.attrib.get('value')
                text = option.css('::text').get().strip()
                if href and href.lower() != 'none':
                    ethnic_url = self.host_url + href
                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
                    if self.debug:
                        break

    def parse_ethnic_page(self, response):
        ethnic = response.meta['ethnic']
        rows = response.css('div.row.headshotrow')
        for row in rows:
            cols = row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6')
            for col in cols:
                link_tag = col.css('a')
                img_tag = col.css('div.pictag')
                if link_tag and img_tag:
                    href = self.host_url + link_tag.attrib['href']
                    person = img_tag.css('::text').get().strip()

                    item = IAFDPersonItem()
                    item['name'] = person
                    item['href'] = href
                    item['from_astro_list'] = 0
                    item['from_birth_list'] = 0
                    item['from_ethnic_list'] = 1
                    item['from_movie_list'] = 0
                    yield item
                    #yield scrapy.Request(href, callback=self.parse_person_detail_page)

        next_page = response.css('a[rel="next"]')
        if next_page:
            next_url = self.host_url + next_page.attrib['href']
            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})

    def parse_distributors_list_page(self, response):
        select_element = response.css('select[name="Distrib"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = option.css('::text').get().strip()
                dis_url = self.host_url + f"/distrib.rme/distrib={value}"
                item = IAFDMovieItem()
                item['title'] = text
                item['href'] = dis_url
                item['release_year'] = 0
                item['from_performer_list'] = 0
                item['from_dist_list'] = 1
                item['from_stu_list'] = 0
                yield item
                yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)

    def parse_studios_list_page(self, response):
        select_element = response.css('select[name="Studio"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = option.css('::text').get().strip()
                stu_url = self.host_url + f"/studio.rme/studio={value}"
                item = IAFDMovieItem()
                item['title'] = text
                item['href'] = stu_url
                item['release_year'] = 0
                item['from_performer_list'] = 0
                item['from_dist_list'] = 0
                item['from_stu_list'] = 1
                yield item
                yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)

    def parse_person_detail_page(self, response):
        item = IAFDPersonDetailItem()
        item['href'] = response.url
        item['person'] = response.css('h1::text').get()  # assumes the name is in the h1 tag
        # Parse the remaining detail fields; adjust the selectors to the actual page structure
        item['gender'] = response.css('span.gender::text').get()
        item['birthday'] = response.css('span.birthday::text').get()
        item['astrology'] = response.css('span.astrology::text').get()
        item['birthplace'] = response.css('span.birthplace::text').get()
        item['years_active'] = response.css('span.years_active::text').get()
        item['ethnicity'] = response.css('span.ethnicity::text').get()
        item['nationality'] = response.css('span.nationality::text').get()
        item['hair_colors'] = response.css('span.hair_colors::text').get()
        item['eye_color'] = response.css('span.eye_color::text').get()
        item['height'] = response.css('span.height::text').get()
        item['weight'] = response.css('span.weight::text').get()
        item['measurements'] = response.css('span.measurements::text').get()
        item['tattoos'] = response.css('span.tattoos::text').get()
        item['piercings'] = response.css('span.piercings::text').get()
        item['movies_cnt'] = response.css('span.movies_cnt::text').get()
        item['vixen_cnt'] = response.css('span.vixen_cnt::text').get()
        item['blacked_cnt'] = response.css('span.blacked_cnt::text').get()
        item['tushy_cnt'] = response.css('span.tushy_cnt::text').get()
        item['x_art_cnt'] = response.css('span.x_art_cnt::text').get()
        item['performer_aka'] = response.css('span.performer_aka::text').getall()
        yield item

    def parse_movie_detail_page(self, response):
        item = IAFDMovieDetailItem()
        item['title'] = response.css('h1::text').get()  # assumes the title is in the h1 tag
        item['href'] = response.url
        # Parse more movie detail fields here as needed, based on the actual page structure
        yield item
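For reference, a hypothetical way (not part of the commit) to launch the spider with the arguments defined in its __init__; it assumes the process is started from inside the Scrapy project so get_project_settings() picks up the settings shown earlier:

# Hypothetical launcher script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.iafd_spider import IAFDSpider

process = CrawlerProcess(get_project_settings())
# debug='true' breaks out of the listing loops early and caps the DB queries at 5 rows;
# update='0' restricts the DB-driven re-crawl to rows with is_full_data = 0.
process.crawl(IAFDSpider, debug='true', update='0')
process.start()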