diff --git a/scrapy_proj/cron/cmd.txt b/scrapy_proj/cron/cmd.txt index 880eb91..787b905 100644 --- a/scrapy_proj/cron/cmd.txt +++ b/scrapy_proj/cron/cmd.txt @@ -10,4 +10,4 @@ scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LE scrapy crawl javbus -a cmd=actors -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ -scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ \ No newline at end of file +scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ -s JOBDIR=/home/ubuntu/sharedata/scrapy_job \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py index 74a0314..5f3d411 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py +++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py @@ -1020,6 +1020,321 @@ class IAFDDBHandler(SQLiteDBHandler): return final_result +@register_handler(comm.SPIDER_NAME_JAVDB) +class JavDBHandler(SQLiteDBHandler): + #def __init__(self, db_path=shared_db_path): + def __init__(self, db_path=test_db_path): + super().__init__(db_path) + self.tbl_javdb_actors = 'javdb_actors' + self.tbl_javdb_alias = 'javdb_actors_alias' + self.tbl_javdb_actor_movies = 'javdb_actors_movies' + self.tbl_javdb_makers = 'javdb_makers' + self.tbl_javdb_movies = 'javdb_movies' + self.tbl_javdb_movie_tags = 'javdb_movies_tags' + self.tbl_javdb_publishers = 'javdb_publishers' + self.tbl_javdb_series = 'javdb_series' + self.tbl_javdb_tags = 'javdb_tags' + + def insert_item(self, item): + # 获取Item中所有定义的字段(包括父类继承的) + all_fields = item.fields.keys() + # 获取已被赋值的字段(存储在Item的内部属性_values中) + assigned_fields = set(item._values.keys()) + # 过滤被赋值过的字段,其他预定义的字段不处理,这样在插入/更新时才不影响无关字段的值 + processed_item = {} + for field in assigned_fields: + processed_item[field] = item[field] + + if isinstance(item, items_def.JavdbActorsItem): + 
self.insert_or_update_actor(processed_item) + + elif isinstance(item, items_def.JavdbMoviesItem): + self.insert_or_update_movie(processed_item) + + elif isinstance(item, items_def.JavdbMakersItem): + self.insert_or_update_makers(data=processed_item) + + elif isinstance(item, items_def.JavdbSeriesItem): + self.insert_or_update_series(data=processed_item) + + elif isinstance(item, items_def.JavdbPublishersItem): + self.insert_or_update_publishers(data=processed_item) + + else: + logging.error(f"unkown item. {processed_item}") + + return item + + # 统计函数 + def get_stat(self): + stats_config = [ + # 演员相关统计 + {'table': self.tbl_javdb_actors, 'alias': 'actors'}, + + # 电影相关统计 + {'table': self.tbl_javdb_movies, 'alias': 'movies'}, + {'table': self.tbl_javdb_movies, 'alias': 'mov_un', 'where': 'uncensored=1'}, + {'table': self.tbl_javdb_movies, 'alias': 'mov_full', 'where': 'is_full_data=1'}, + {'table': self.tbl_javdb_movies, 'alias': 'mov_un_full', 'where': 'uncensored=1 AND is_full_data=1'}, + + # 其他表统计 + {'table': self.tbl_javdb_makers, 'alias': 'makers'}, + {'table': self.tbl_javdb_publishers, 'alias': 'pubs'}, + {'table': self.tbl_javdb_series, 'alias': 'series'} + ] + return self.generic_stats_query(stats_config) + + + def insert_actor_index(self, name, href, **kwargs): + fields = [ + 'from_actor_list', 'from_movie_list' + ] + data = {'name': name, 'href': href} + # 如果没有传入值,就用原来的值 + for field in fields: + if kwargs.get(field) is not None: + data[field] = kwargs.get(field) + + return self.insert_or_update_common(data=data, tbl_name=self.tbl_javdb_actors, uniq_key='href', exists_do_nothing=False) + + def insert_movie_index(self, title, href, **kwargs): + fields = [ + 'from_actor_list', 'from_movie_makers', 'from_movie_series', 'from_movie_publishers', + 'maker_id', 'series_id', 'pub_id', 'uncensored' + ] + data = {'title': title, 'href': href} + # 如果没有传入值,就用原来的值 + for field in fields: + if kwargs.get(field) is not None: + data[field] = 
kwargs.get(field) + + return self.insert_or_update_common(data=data, tbl_name=self.tbl_javdb_movies, uniq_key='href', exists_do_nothing=False) + + # 插入演员和电影的关联数据 + def insert_actor_movie(self, performer_id, movie_id, tags=''): + return self.insert_or_update_with_composite_pk( + data = {'actor_id': performer_id, 'movie_id': movie_id, 'tags': tags}, + tbl_name = self.tbl_javdb_actor_movies, + composite_pk = ['actor_id', 'movie_id'], + exists_do_nothing = True + ) + + # 插入演员数据 + def insert_or_update_actor(self, actor): + try: + actor_id = self.insert_or_update_common(data=actor, tbl_name=self.tbl_javdb_actors, uniq_key='href', exists_do_nothing=False) + if not actor_id: + logging.error(f"insert/update acotr error. data:{actor}") + return None + + # 查询刚插入的数据 + self.cursor.execute('SELECT id, from_actor_list FROM javdb_actors WHERE href = ?', (actor['href'],)) + actor_id, uncensored = self.cursor.fetchone() or (None, None) + if actor_id is None: + logging.warning(f"insert data error. name: {actor['name']}, href: {actor['href']}") + return None + + logging.debug(f"insert one actor, id: {actor_id}, name: {actor['name']}, href: {actor['href']}") + + # 插入别名 + for alias in actor.get("alias") or []: + self.insert_or_update_with_composite_pk( + data = {'actor_id': actor_id, 'alias': alias}, + tbl_name = self.tbl_javdb_alias, + composite_pk = ['actor_id', 'alias'], + exists_do_nothing = True + ) + + # 插入影片列表 + for movie in actor.get("credits") or []: + # from_actor_list = 1 表示无码影星的,其他不处理 + if uncensored and uncensored > 0: + movie_id = self.insert_movie_index(movie['title'], movie['href'], from_actor_list=1, uncensored=uncensored) + else: + movie_id = self.insert_movie_index(movie['title'], movie['href'], from_actor_list=1) + if movie_id: + tmp_id = self.insert_actor_movie(actor_id, movie_id) + if tmp_id : + logging.debug(f'insert one performer_movie, performer_id: {actor_id}, movie_id: {movie_id}') + else: + logging.warning(f'insert performer_movie failed. 
performer_id: {actor_id}, moive href: {movie['href']}') + + return actor_id + except Exception as e: + logging.error(f"插入/更新演员 {actor['name']} 失败: {e}") + self.conn.rollback() + + # """插入或更新电影数据(异常url的处理,比如404链接)""" + def insert_or_update_actor_404(self, name, href, is_full_data=1): + return self.insert_or_update_common( + data={'name': name, 'href':href, 'is_full_data':is_full_data}, + tbl_name=self.tbl_javdb_actors, + uniq_key='href', + exists_do_nothing=False) + + # 查询 # TODO: 表结构需要增加个 movies_cnt 字段 + def query_actors(self, **filters): + return self.generic_query( + table_name = self.tbl_javdb_actors, + fields = ['href', 'name', 'is_full_data', 'movies_cnt'], + filters = filters + ) + + # 插入或更新发行商 """ + def insert_or_update_makers(self, data, caller='list'): + if caller == 'list': + data['from_list'] = 1 + elif caller == 'movie': + data['from_movie_list'] = 1 + + return self.insert_or_update_common(data=data, tbl_name=self.tbl_javdb_makers, uniq_key='href', exists_do_nothing=False) + + # 按条件查询 href 列表 + def query_maker_hrefs(self, **filters): + return self.generic_query( + table_name = self.tbl_javdb_makers, + fields = ['href', 'id', 'from_list'], + filters = filters + ) + + # """ 插入或更新制作公司 """ + def insert_or_update_series(self, data, caller='list'): + if caller == 'list': + data['from_list'] = 1 + elif caller == 'movie': + data['from_movie_list'] = 1 + + return self.insert_or_update_common(data=data, tbl_name=self.tbl_javdb_series, uniq_key='href', exists_do_nothing=False) + + # 按条件查询 href 列表 + def query_series_hrefs(self, **filters): + return self.generic_query( + table_name = self.tbl_javdb_series, + fields = ['href', 'id', 'from_list'], + filters = filters + ) + + # 插入或更新发行商 """ + def insert_or_update_publishers(self, data, caller='list'): + if caller == 'list': + data['from_list'] = 1 + elif caller == 'movie': + data['from_movie_list'] = 1 + + return self.insert_or_update_common(data=data, tbl_name=self.tbl_javdb_publishers, uniq_key='href', 
exists_do_nothing=False) + + # 按条件查询 href 列表 + def query_publishers_hrefs(self, **filters): + return self.generic_query( + table_name = self.tbl_javdb_publishers, + fields = ['href', 'id'], + filters = filters + ) + + # 插入或更新类别 """ + def insert_or_update_tags(self, name, href): + return self.insert_or_update_common( + data={'name': name, 'href': href}, + tbl_name=self.tbl_javdb_tags, uniq_key='href', exists_do_nothing=True + ) + + # 查询tags + def query_tags(self, href, name): + return self.generic_query( + table_name = self.tbl_javdb_tags, + fields = ['id', 'name', 'href'], + filters = {} + ) + + # 插入影片和tags的关联数据 + def insert_movie_tags(self, movie_id, tag_id, tags=''): + return self.insert_or_update_with_composite_pk( + data = {'tag_id': tag_id, 'movie_id': movie_id, 'tags': tags}, + tbl_name = self.tbl_javdb_movie_tags, + composite_pk = ['tag_id', 'movie_id'], + exists_do_nothing = True + ) + + # """插入或更新电影数据""" + def insert_or_update_movie(self, movie): + try: + # 获取相关 ID + makers_id = self.get_id_by_key(tlb=self.tbl_javdb_makers, uniq_key='href', val=movie['maker_link']) if movie['maker_link'] else None + series_id = self.get_id_by_key(tlb=self.tbl_javdb_series, uniq_key='href', val=movie['series_link']) if movie['series_link'] else None + pub_id = self.get_id_by_key(tlb=self.tbl_javdb_publishers, uniq_key='href', val=movie['pub_link']) if movie['pub_link'] else None + + # 如果不存在,插入 + if makers_id is None and movie['maker_link']: + makers_id = self.insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}, caller='movie') + if series_id is None and movie['series_link']: + series_id = self.insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}, caller='movie') + if pub_id is None and movie['pub_link']: + pub_id = self.insert_or_update_publishers({'name' : movie.get('pub_name', ''), 'href' : movie.get('pub_link', '')}, caller='movie') + + movie['maker_id'] = makers_id if 
makers_id else 0 + movie['series_id'] = series_id if series_id else 0 + movie['pub_id'] = pub_id if pub_id else 0 + movie_id = self.insert_or_update_common(movie, tbl_name=self.tbl_javdb_movies, uniq_key='href', exists_do_nothing=False) + if movie_id is None: + return None + + logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}") + + # 插入 performers_movies 关系表 + for performer in movie.get('actors', []): + performer_id = self.get_id_by_key(tlb=self.tbl_javdb_actors, uniq_key='href', val=performer['href']) + # 如果演员不存在,先插入 + if performer_id is None: + performer_id = self.insert_actor_index(performer['name'], performer['href'], from_movie_list=1) + logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}") + if performer_id: + tmp_id = self.insert_actor_movie(performer_id, movie_id) + if tmp_id: + logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}") + else: + logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}") + else: + logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}") + + # 插入 tags 表 + for tag in movie.get('tags', []): + tag_name = tag.get('name', '') + tag_href = tag.get('href', '') + tag_id = self.insert_or_update_tags(tag_name, tag_href) + if tag_id: + logging.debug(f"insert one tags. tag_id: {tag_id}, name: {tag_name}") + tmp_id = self.insert_movie_tags(movie_id=movie_id, tag_id=tag_id, tags=tag_name) + if tmp_id: + logging.debug(f"insert one movie_tag. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}") + else: + logging.warning(f"insert one movie_tag error. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}") + else: + logging.warning(f"insert tags error. 
name:{tag_name}, href: {tag_href}") + + return movie_id + + except Exception as e: + self.conn.rollback() + logging.error("Error inserting movie: %s", e) + return None + + # """插入或更新电影数据(异常url的处理,比如404链接)""" + def insert_or_update_movie_404(self, title, href, is_full_data=1): + return self.insert_or_update_common( + data={'title': title, 'href':href, 'is_full_data':is_full_data}, + tbl_name=self.tbl_javdb_movies, + uniq_key='href', + exists_do_nothing=False) + + # 查询 + def query_movie_hrefs(self, **filters): + return self.generic_query( + table_name = self.tbl_javdb_movies, + fields = ['href', 'title', 'id', 'is_full_data'], + filters = filters + ) + + if __name__ == "__main__": db = IAFDDBHandler() print(f"case1, 预期: 有结果:") diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index 9c4011c..fb22d57 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -398,6 +398,9 @@ class JavdbActorsItem(scrapy.Item): is_full_data = scrapy.Field() from_actor_list = scrapy.Field() from_movie_list = scrapy.Field() + # 以下为添加字段 + alias = scrapy.Field() + credits = scrapy.Field() class JavdbActorsAliasItem(scrapy.Item): actor_id = scrapy.Field() @@ -432,6 +435,15 @@ class JavdbMoviesItem(scrapy.Item): from_movie_publishers = scrapy.Field() pub_id = scrapy.Field() uncensored = scrapy.Field() + # 以下为添加字段 + maker_name = scrapy.Field() + maker_link = scrapy.Field() + series_name = scrapy.Field() + series_link = scrapy.Field() + pub_name = scrapy.Field() + pub_link = scrapy.Field() + actors = scrapy.Field() + tags = scrapy.Field() class JavdbMoviesTagsItem(scrapy.Item): movie_id = scrapy.Field() diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py index 9351ed3..7f55951 100644 --- a/scrapy_proj/scrapy_proj/settings.py +++ b/scrapy_proj/scrapy_proj/settings.py @@ -164,3 +164,23 @@ HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" # Set settings whose default value is deprecated to a 
future-proof value FEED_EXPORT_ENCODING = "utf-8" + +# 设置磁盘队列存储目录 +JOBDIR = 'crawl_state' + +# 调度器配置 +SCHEDULER = "scrapy.core.scheduler.Scheduler" +SCHEDULER_PERSIST = True # 是否持久化调度器状态(避免中断丢失请求) +SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue" +SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue" # 磁盘队列(FIFO) + +# 如果需要 LIFO(栈式调度) +# SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue" +# SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleLifoDiskQueue" + +# Scrapy 自带的 AutoThrottle 可以根据服务器响应速度动态调整爬取速率: +AUTOTHROTTLE_ENABLED = True +AUTOTHROTTLE_START_DELAY = 1 # 初始延迟 +AUTOTHROTTLE_MAX_DELAY = 10 # 最大延迟 +AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 +AUTOTHROTTLE_DEBUG = False \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/spiders/javdb_spider.py b/scrapy_proj/scrapy_proj/spiders/javdb_spider.py index 859b7ce..cf639a9 100644 --- a/scrapy_proj/scrapy_proj/spiders/javdb_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/javdb_spider.py @@ -1,10 +1,338 @@ import scrapy +import re +import sys +from urllib.parse import urljoin, quote_plus +from scrapy_proj.spiders.base_spider import BaseSpider +from scrapy_proj.items import JavdbActorsAliasItem, JavdbActorsItem, JavdbActorsMoviesItem, JavdbMakersItem, JavdbMoviesItem, JavdbMoviesTagsItem, JavdbPublishersItem, JavdbSeriesItem, JavdbTagsItem +from scrapy_proj.db_wapper.spider_db_handler import JavDBHandler +from scrapy_proj.comm.comm_def import SPIDER_NAME_JAVDB +from scrapy_proj.spiders.parser.javdb_parser import common_parser +from scrapy_proj.utils.utils import pretty_json_simple, normalize_url, generate_multilang_urls, is_valid_url + +db_tools = JavDBHandler() + +class JavdbSpiderSpider(BaseSpider): + name = SPIDER_NAME_JAVDB + allowed_domains = ["javdb.com", "www.javdb.com"] + + # 配置请求头(复用curl中的头部信息) + custom_settings = { + "DEFAULT_REQUEST_HEADERS": { + 'accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', + 'priority': 'u=0, i', + 'referer': 'https://javdb.com/', + 'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'sec-fetch-dest': 'document', + 'sec-fetch-mode': 'navigate', + 'sec-fetch-site': 'same-origin', + 'sec-fetch-user': '?1', + 'upgrade-insecure-requests': '1', + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0' + }, + "COOKIES_ENABLED": True # 启用Cookie支持 + } + + host_url = "https://www.javdb.com" + + def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs): + super().__init__(*args, **kwargs) + self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False + self.update_mode = True if mod and mod.lower() == 'update' else False + self.logger.info(f"RUN CMD: {' '.join(sys.argv)}") + + self.cmd_actors = 'actors' + self.cmd_movies = 'movies' + self.cmd_dist = 'dist_list' + self.cmd_list = [self.cmd_actors, self.cmd_movies, self.cmd_dist] + if cmd and cmd != '': + self.cmd_list = cmd.split(',') + + self.existed_actors = {} + self.existed_movies = {} + self.load_existed_actors() + self.load_existed_movies() + + self.requested_url = set() + + # 入口函数,由基类的方法触发 + def custom_start_requests(self): + self.crawler.stats.set_value(f"{self.name}/actor_all", 0) + self.crawler.stats.set_value(f"{self.name}/actor_done", 0) + self.crawler.stats.set_value(f"{self.name}/movie_all", 0) + self.crawler.stats.set_value(f"{self.name}/movie_done", 0) + # 根据命令字执行 + if self.cmd_actors in self.cmd_list: + url = urljoin(self.host_url, "/actors/uncensored") + yield scrapy.Request(url, + callback=self.parser_actor_list, + 
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头 + meta={'uncensored':1, 'from_actor_list':1, 'depth':1}) + + ''' + url = urljoin(self.host_url, "/actors/censored") + yield scrapy.Request(url, + callback=self.parser_actor_list, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头 + meta={'uncensored':1, 'from_actor_list':1}) + ''' + + # 演员列表页解析 + def parser_actor_list(self, response): + uncensored = response.meta.get('uncensored', 1) + depth = response.meta.get('depth', 1) + if self.debug and depth>=3: + self.logger.info(f'debug mode. stop next page. url: {response.url}') + return + data, next_url = common_parser(html=response.text, page='actor_list', href=response.url) + if data: + self.logger.info(f"fetched data from {response.url}, data count: {len(data)}") + for item in data: + url = item['href'] + name = item['name'] + # 更新对应语言的姓名 + item = JavdbActorsItem() + item['href'] = url + item["name"] = name + yield item + + # 发起查询详情, + if self._can_request(url) : + yield from self._create_performer_request(href=url, name=name, actor_url=url, depth=1) + + if next_url: + yield scrapy.Request(next_url, + callback=self.parser_actor_list, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头 + meta={'uncensored':1, 'from_actor_list':1, 'depth':depth+1} + ) + else: + self._handle_invalid_response(response, page='actor_list') + + # 处理详细的解析页面 + def parse_actor_detail_page(self, response): + actor_url = response.meta.get('actor_url', '') + actor_name = response.meta.get('actor_name', '') + depth = response.meta.get('depth', 1) + if self.debug and depth>=3: + self.logger.info(f'debug mode. stop next page. 
url: {response.url}') + return + data, next_url = common_parser(html=response.text, page='actor', href=response.url) + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + + # 判断是否需要更新: 存在完整数据,且影片数量相同 + movies_cnt = data.get('movies_cnt', 0) + if not self.need_update_actor(href=actor_url, movies_cnt=movies_cnt): + self.crawler.stats.inc_value(f"{self.name}/actor_done") + self.logger.info(f"actor ({actor_name}) up to date. movies cnt: {movies_cnt} skipping... url: {actor_url}") + return None + + # 需要更新了,先翻页 + if next_url: + yield from self._create_performer_request(href=next_url, name=actor_name, actor_url=actor_url, depth=depth+1) + else: + self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}") + self.crawler.stats.inc_value(f"{self.name}/actor_done") + self.add_actor_to_existed(href=actor_url, movies_cnt=movies_cnt) + + # 更新详情数据 + item = JavdbActorsItem() + item['href'] = actor_url + item['name'] = actor_name + item['from_actor_list'] = 1 + item['movies_cnt'] = movies_cnt + item['avatar'] = data.get('avatar', {}) + item['credits'] = data.get('movies', []) + for k, v in data.get('avatar', {}).items(): + if k in item.fields: + item[k] = v + yield item + + # 影片链接,判断是否需要发起 + for item in data.get('movies', []): + yield from self._create_movie_request(href=item['href'], title=item['title']) + else: + self._handle_invalid_response(response, page='actor') -class JavdbSpiderSpider(scrapy.Spider): - name = "javdb_spider" - allowed_domains = ["www.javdb.com"] - start_urls = ["https://www.javdb.com"] + # 统一处理发起影片查询的请求 + def _create_performer_request(self, href, name, actor_url, depth=1): + if href == '': + return + if is_valid_url(href): + if self._can_request(href): + self.crawler.stats.inc_value(f"{self.name}/actor_all") + yield scrapy.Request(href, + callback=self.parse_actor_detail_page, + meta={'actor_name': name, 'actor_url': actor_url, 'item_type':'actor', 'depth':depth } + ) + else: + self.logger.warning(f"wrong 
url. {href}, ignore...") - def parse(self, response): - pass + # 统一处理发起影片查询的请求 + def _create_movie_request(self, href, title): + if href == '': + return + if is_valid_url(href): + if self.need_update_movie(href) and self._can_request(href): + self.crawler.stats.inc_value(f"{self.name}/movie_all") + yield scrapy.Request(href, + callback=self.parse_movie_detail_page, + meta={'title': title, 'item_type':'movie', 'cache':True} + ) + else: + self.logger.warning(f"wrong url. {href}, ignore...") + + # 统一处理发起影片查询的请求 + def _create_movie_list_request(self, href, name, category, depth=1): + if href == '': + return + if is_valid_url(href): + yield scrapy.Request(href, + callback=self.parse_movie_list_page, + meta={'name': name, 'category':category, 'depth':depth} + ) + else: + self.logger.warning(f"wrong url. {href}, ignore...") + + def parse_movie_detail_page(self, response): + title = response.meta.get('title', '') + data = common_parser(html=response.text, page='movies', href=response.url, title=title) + if data: + self.crawler.stats.inc_value(f"{self.name}/movie_done") + self.logger.debug(f"fetched data from {response.url}, data: {data}") + # 把movies信息入库 + item = JavdbMoviesItem() + for k, v in data.items(): + if k in item.fields: + item[k] = v + yield item + + # 处理actors列表 + for actor in data.get('actors', []): + yield from self._create_performer_request(href=actor['href'], name=actor['name'], actor_url=actor['href'], depth=1) + + # 处理 maker + yield from self._create_movie_list_request(href=data['maker_link'], name=data['maker_name'], category='maker', depth=1) + + # 处理 series + yield from self._create_movie_list_request(href=data['series_link'], name=data['series_name'], category='series', depth=1) + + # 处理 pub + yield from self._create_movie_list_request(href=data['pub_link'], name=data['pub_name'], category='pub', depth=1) + + else: + self._handle_invalid_response(response, page='movie') + + # 处理 tags, studio, label, series 列表的公共函数 + def parse_movie_list_page(self, 
response): + data, next_url = common_parser(html=response.text, page='movie_list', href=response.url) + category = response.meta.get('category', '') + name = response.meta.get('name', '') + depth = response.meta.get('depth', 1) + if self.debug and depth>=3: + self.logger.info(f"debug mode, stop next page. url: {response.url}") + return + + if data: + self.logger.debug(f"fetched data from {response.url}, data: {data}") + + # 影片链接,判断是否需要发起 + for item in data: + yield from self._create_movie_request(href=item['href'], title=item['title']) + + # 处理翻页 + if next_url: + yield scrapy.Request(next_url, + callback=self.parse_movie_list_page, + headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头 + meta={'name': name, 'category': category, 'depth': depth+1} + ) + else: + self.logger.info(f"movies list ({category}) read all pages. url :{response.url}") + else: + self._handle_invalid_response(response, page='movie_list') + + + # 统一判断并处理异常 + def _handle_invalid_response(self, response, page=None): + if response.status in [200]: + if "404 page not found" in response.text.lower(): + self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}") + else: + self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}") + elif response.status in [404, 403]: + self.logger.warning(f"get 404 page. url: {response.url}") + else: + self.logger.warning(f"unkown page. 
url:{response.url}, status: {response.status}, content: {response.text[:500]}") + + if page: + if page == 'actor': + item = JavdbActorsItem() + item['href'] = response.url + item['name'] = response.meta.get('actor_name', '') + item['is_full_data'] = 404 + yield item + elif page == 'movie' : + item = JavdbMoviesItem() + item['href'] = response.url + item['title'] = response.meta.get('title', '') + item['is_full_data'] = 404 + yield item + + # TODO: 表结构需要增加个 movies_cnt 字段 + def load_existed_actors(self): + query_args = {} + rows = db_tools.query_actors(**query_args) + if rows: + for item in rows: + self.existed_actors[item['href']] = {'is_full_data': item['is_full_data'], 'movies_cnt': item['movies_cnt']} + else: + self.logger.warning(f"query_actors empty. query args: {query_args}") + + + def load_existed_movies(self): + query_args = {} + rows = db_tools.query_movie_hrefs(**query_args) + if rows: + for item in rows: + self.existed_movies[item['href']] = item.get('is_full_data', 0) + else: + self.logger.warning(f"query_movies empty. 
query args: {query_args}") + + # 内存缓存,也可以改为查询db + def need_update_movie(self, href): + return not (href in self.existed_movies and self.existed_movies[href] >0) + + # 内存缓存,也可以改为查询db + def need_update_actor(self, href, movies_cnt): + if href not in self.existed_actors: + return True + data = self.existed_actors[href] + if data['is_full_data'] <=0 : + return True + if data['movies_cnt'] < movies_cnt: + return True + + return False + + def add_actor_to_existed(self, href, movies_cnt, is_full_data=1): + self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt} + + def acc_movie_to_existed(self, href, is_full_data=1): + self.existed_movies[href] = is_full_data + + def _can_request(self, href): + if href in self.requested_url: + return False + self.requested_url.add(href) + return True \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/spiders/parser/javdb_parser.py b/scrapy_proj/scrapy_proj/spiders/parser/javdb_parser.py index ac4fa44..8ea9732 100644 --- a/scrapy_proj/scrapy_proj/spiders/parser/javdb_parser.py +++ b/scrapy_proj/scrapy_proj/spiders/parser/javdb_parser.py @@ -10,7 +10,7 @@ import re from bs4 import BeautifulSoup from requests.exceptions import RequestException from functools import partial -import config +#import config #import utils # 定义基础 URL 和可变参数 @@ -42,30 +42,18 @@ def common_parser(html, page, **kwargs): #parse_actors_uncensored(soup, href): #return list_data, next_url return parse_actors_uncensored(soup, **kwargs) - elif page == 'series_list': - #parse_series_uncensored(soup, href): - #return list_data, next_url - return parse_series_uncensored(soup, **kwargs) - elif page == 'series': - #parse_series_detail(soup, href): - #return list_data, next_url - return parse_series_detail(soup, **kwargs) - elif page == 'makers_list': - #parse_makers_uncensored(soup, href): - #return list_data, next_url - return parse_makers_uncensored(soup, **kwargs) - elif page == 'makers': - #parse_maker_detail(soup, href): - #return 
list_data, next_url - return parse_maker_detail(soup, **kwargs) - elif page == 'publisher': - #parse_publisher_detail(soup, href): - #return list_data, next_url - return parse_publisher_detail(soup, **kwargs) elif page == 'actor': #parse_actor_detail(soup, href): #return actor, next_url return parse_actor_detail(soup, **kwargs) + elif page == 'makers_series_list': + #parse_maker_series_list(soup, href, category): series / makers + #return list_data, next_url + return parse_maker_series_list(soup, **kwargs) + elif page == 'movie_list': + #parse_movie_list(soup, href): + #return list_data, next_url + return parse_movie_list(soup, **kwargs) elif page == 'movies': #parse_movie_detail(soup, href, title): #return result @@ -226,6 +214,8 @@ def parse_actors_uncensored(soup, href): def parse_actor_detail(soup, href): # 先找一下别名 alias_list = [] + movies_text = '' + movies_cnt = 0 div_meta = soup.find('span', class_='actor-section-name') if not div_meta: @@ -237,7 +227,14 @@ def parse_actor_detail(soup, href): meta_list = alias_div.find_all('span', class_='section-meta') if len(meta_list) > 1: alias_list = meta_list[0].text.strip().split(", ") - + movies_text = meta_list[1].text.strip() + try: + match = re.search(r'(\d+)\s+movie\(s\)', movie_elem.strip(), re.IGNORECASE) + if match: + movies_cnt = int(match.group(1)) + except Exception as e: + movies_cnt = 0 + # 头像 pic = '' avatar = soup.find("div", class_="column actor-avatar") @@ -286,63 +283,14 @@ def parse_actor_detail(soup, href): actor = { 'pic' : pic, - 'alias' : alias_list, + 'movies_text' : movies_text, + 'movies_cnt' : movies_cnt, 'movies' : list_data } return actor, next_url -# 解析 HTML 内容,提取需要的数据 -def parse_movie_detail_old(soup, href, title): - div_video = soup.find("div", class_='video-meta-panel') - if not div_video: - logging.warning(f"Warning: No movies div found ") - return None, None - - # 获取封面图片 - cover_img = soup.select_one('.column-video-cover a') - cover_url = cover_img['href'] if cover_img else None - - # 
获取番号 - serial = soup.select_one('.panel-block:first-child .value') - serial_number = serial.text.strip() if serial else None - - # 获取日期 - date = soup.select_one('.panel-block:nth-of-type(2) .value') - release_date = date.text.strip() if date else None - - # 获取时长 - duration = soup.select_one('.panel-block:nth-of-type(3) .value') - video_duration = duration.text.strip() if duration else None - - # 获取片商 - maker = soup.select_one('.panel-block:nth-of-type(4) .value a') - maker_name = maker.text.strip() if maker else None - maker_link = maker['href'] if maker else None - - # 获取系列 - series = soup.select_one('.panel-block:nth-of-type(5) .value a') - series_name = series.text.strip() if series else None - series_link = series['href'] if series else None - - # 获取演员(名字 + 链接) - actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']} for actor in soup.select('.panel-block:nth-of-type(8) .value a')] - - return { - 'href' : href, - 'title' : title, - 'cover_url': cover_url, - 'serial_number': serial_number, - 'release_date': release_date, - 'duration': video_duration, - 'maker_name': maker_name, - 'maker_link': host_url + maker_link if maker_link else '', - 'series_name': series_name, - 'series_link': host_url + series_link if series_link else '', - 'actors': actors - } - # 解析单个元素 def parse_movie_one(soup, keys): key_strong = soup.find('strong', string=lambda text: text in keys) @@ -413,8 +361,8 @@ def parse_movie_detail(soup, href, title): return result # 解析 HTML 内容,提取需要的数据 -def parse_series_uncensored(soup, href): - div_series = soup.find("div", id='series') +def parse_maker_series_list(soup, href, category): + div_series = soup.find("div", id=category) if not div_series: logging.warning(f"Warning: No div_series div found ") return None, None @@ -453,9 +401,8 @@ def parse_series_uncensored(soup, href): return list_data, next_url - # 解析 HTML 内容,提取需要的数据 -def parse_series_detail(soup, href): +def parse_movie_list(soup, href): #div_movies = soup.find("div", 
class_='movie-list h cols-4 vcols-5') div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)')) if not div_movies: @@ -491,162 +438,3 @@ def parse_series_detail(soup, href): next_url = host_url + next_page_url return list_data, next_url - - -# 解析 HTML 内容,提取需要的数据 -def parse_makers_uncensored(soup, href): - div_series = soup.find("div", id='makers') - if not div_series: - logging.warning(f"Warning: No makers div found ") - return None, None - - # 解析元素 - rows = div_series.find_all('a', class_='box') - - list_data = [] - next_url = None - for row in rows: - name = row.find('strong').text.strip() - href = row['href'] - div_movies = row.find('span') - movies = 0 - if div_movies: - match = re.search(r'\((\d+)\)', div_movies.text.strip()) - if match: - movies = int(match.group(1)) - - list_data.append({ - 'name' : name, - 'href' : host_url + href if href else '', - 'movies' : movies - }) - - # 查找 "下一页" 按钮 - next_page_element = soup.find('a', class_='pagination-next') - if next_page_element: - next_page_url = next_page_element['href'] - next_page_number = url_page_num(next_page_url) - current_page_number = url_page_num(href) - if current_page_number is None: - current_page_number = 0 - if next_page_number and next_page_number > current_page_number : - next_url = host_url + next_page_url - - return list_data, next_url - - -# 解析 HTML 内容,提取需要的数据 -def parse_maker_detail(soup, href): - #div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5') - div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)')) - if not div_movies: - logging.warning(f"Warning: No movies div found ") - return [], None - - # 解析元素 - rows = div_movies.find_all('div', class_='item') - - list_data = [] - next_url = None - for row in rows: - link = row.find('a', class_='box')['href'] - serial_number = row.find('strong').text.strip() - title = row.find('div', class_='video-title').text.strip() - release_date = row.find('div', 
class_='meta').text.strip() - list_data.append({ - 'href' : host_url + link if link else '', - 'serial_number' : serial_number, - 'title' : title, - 'release_date': release_date - }) - - # 查找 "下一页" 按钮 - next_page_element = soup.find('a', class_='pagination-next') - if next_page_element: - next_page_url = next_page_element['href'] - next_page_number = url_page_num(next_page_url) - current_page_number = url_page_num(href) - if current_page_number is None: - current_page_number = 0 - if next_page_number and next_page_number > current_page_number : - next_url = host_url + next_page_url - - return list_data, next_url - -# 解析 HTML 内容,提取需要的数据 -def parse_publisher_detail(soup, href): - #div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5') - div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)')) - if not div_movies: - logging.warning(f"Warning: No movies div found ") - return [], None - - # 解析元素 - rows = div_movies.find_all('div', class_='item') - - list_data = [] - next_url = None - for row in rows: - link = row.find('a', class_='box')['href'] - serial_number = row.find('strong').text.strip() - title = row.find('div', class_='video-title').text.strip() - release_date = row.find('div', class_='meta').text.strip() - list_data.append({ - 'href' : host_url + link if link else '', - 'serial_number' : serial_number, - 'title' : title, - 'release_date': release_date - }) - - # 查找 "下一页" 按钮 - next_page_element = soup.find('a', class_='pagination-next') - if next_page_element: - next_page_url = next_page_element['href'] - next_page_number = url_page_num(next_page_url) - current_page_number = url_page_num(href) - if current_page_number is None: - current_page_number = 0 - if next_page_number and next_page_number > current_page_number : - next_url = host_url + next_page_url - - return list_data, next_url - - -# 解析 HTML 内容,提取需要的数据 -def parse_uncensored(soup, href): - #div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8') - 
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)')) - if not div_movies: - logging.warning(f"Warning: No movies div found ") - return [], None - - # 解析元素 - rows = div_movies.find_all('div', class_='item') - - list_data = [] - next_url = None - for row in rows: - link = row.find('a', class_='box')['href'] - serial_number = row.find('strong').text.strip() - title = row.find('div', class_='video-title').text.strip() - release_date = row.find('div', class_='meta').text.strip() - list_data.append({ - 'href' : host_url + link if link else '', - 'serial_number' : serial_number, - 'title' : title, - 'release_date': release_date - }) - - # 查找 "下一页" 按钮 - next_page_element = soup.find('a', class_='pagination-next') - if next_page_element: - next_page_url = next_page_element['href'] - next_page_number = url_page_num(next_page_url) - current_page_number = url_page_num(href) - if current_page_number is None: - current_page_number = 0 - if next_page_number and next_page_number > current_page_number : - next_url = host_url + next_page_url - - return list_data, next_url - diff --git a/scrapy_proj/scrapy_proj/tools/db_tools.py b/scrapy_proj/scrapy_proj/tools/db_tools.py index bd4ebd9..2d9f930 100644 --- a/scrapy_proj/scrapy_proj/tools/db_tools.py +++ b/scrapy_proj/scrapy_proj/tools/db_tools.py @@ -9,7 +9,7 @@ import scrapy_proj.comm.comm_def as comm import scrapy_proj.items as items_def from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url -class IAFDDBHandler(SQLiteDBHandler): +class SharedBHandler(SQLiteDBHandler): def __init__(self, db_path=shared_db_path): super().__init__(db_path) self.tbl_name_performers = 'iafd_performers' diff --git a/scrapy_proj/sqlalchemy/alembic.ini b/scrapy_proj/sqlalchemy/alembic.ini new file mode 100644 index 0000000..0d7a00e --- /dev/null +++ b/scrapy_proj/sqlalchemy/alembic.ini @@ -0,0 +1,147 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. 
+# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. 
+# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. +path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/README b/scrapy_proj/sqlalchemy/migrations/scrapy/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/scrapy/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/alembic.ini b/scrapy_proj/sqlalchemy/migrations/scrapy/alembic.ini new file mode 100644 index 0000000..0d7a00e --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/scrapy/alembic.ini @@ -0,0 +1,147 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. 
forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. 
+# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. +path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/env.py b/scrapy_proj/sqlalchemy/migrations/scrapy/env.py new file mode 100644 index 0000000..cad295e --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/scrapy/env.py @@ -0,0 +1,100 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. 
+if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = None + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + +''' 修改点 +from models.modelclass_b import Base +target_metadata = Base.metadata + +def run_migrations_online(): + url = "sqlite:///../databases/db_b.db" + connectable = create_engine(url) + # 保持其他代码不变 +''' + +import os +from alembic import context +from sqlalchemy import create_engine +from logging.config import fileConfig +from models.scrapy import Base +target_metadata = Base.metadata + +home_dir = os.path.expanduser("~") +global_share_data_dir = f'{home_dir}/sharedata/sqlite' + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + """ + + url = f"sqlite:///{global_share_data_dir}/scrapy.db" + connectable = create_engine(url) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/script.py.mako b/scrapy_proj/sqlalchemy/migrations/scrapy/script.py.mako new file mode 100644 index 0000000..1101630 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/scrapy/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade schema.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade schema.""" + ${downgrades if downgrades else "pass"} diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/README b/scrapy_proj/sqlalchemy/migrations/shared_comm/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/shared_comm/README @@ -0,0 +1 @@ +Generic single-database configuration. 
\ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/alembic.ini b/scrapy_proj/sqlalchemy/migrations/shared_comm/alembic.ini new file mode 100644 index 0000000..0d7a00e --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/shared_comm/alembic.ini @@ -0,0 +1,147 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. 
When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. +# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. +path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/env.py b/scrapy_proj/sqlalchemy/migrations/shared_comm/env.py new file mode 100644 index 0000000..4a7a2a3 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/shared_comm/env.py @@ -0,0 +1,101 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. 
+if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = None + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +''' 修改点 +from models.modelclass_b import Base +target_metadata = Base.metadata + +def run_migrations_online(): + url = "sqlite:///../databases/db_b.db" + connectable = create_engine(url) + # 保持其他代码不变 +''' + +import os +from alembic import context +from sqlalchemy import create_engine +from logging.config import fileConfig +from models.shared import Base +target_metadata = Base.metadata + +home_dir = os.path.expanduser("~") +global_share_data_dir = f'{home_dir}/sharedata/sqlite' + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + """ + + url = f"sqlite:///{global_share_data_dir}/shared.db" + connectable = create_engine(url) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/script.py.mako b/scrapy_proj/sqlalchemy/migrations/shared_comm/script.py.mako new file mode 100644 index 0000000..1101630 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/shared_comm/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. 
+revision: str = ${repr(up_revision)} +down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade schema.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade schema.""" + ${downgrades if downgrades else "pass"} diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py b/scrapy_proj/sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py new file mode 100644 index 0000000..ed3cbd9 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py @@ -0,0 +1,34 @@ +"""Auto update from shared_comm + +Revision ID: 098c67f16b5e +Revises: +Create Date: 2025-07-29 20:01:58.175086 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '098c67f16b5e' +down_revision: Union[str, Sequence[str], None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('javdb_actors', sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=False)) + op.add_column('javdb_actors', sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('javdb_actors', 'uncensored') + op.drop_column('javdb_actors', 'movies_cnt') + # ### end Alembic commands ### diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/README b/scrapy_proj/sqlalchemy/migrations/testdb/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/alembic.ini b/scrapy_proj/sqlalchemy/migrations/testdb/alembic.ini new file mode 100644 index 0000000..0d7a00e --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/alembic.ini @@ -0,0 +1,147 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library. 
+# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. +# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. 
+path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. 
+[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/env.py b/scrapy_proj/sqlalchemy/migrations/testdb/env.py new file mode 100644 index 0000000..a45b006 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/env.py @@ -0,0 +1,108 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = None + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. 
+ + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +''' 修改点 +from models.modelclass_b import Base +target_metadata = Base.metadata + +def run_migrations_online(): + url = "sqlite:///../databases/db_b.db" + connectable = create_engine(url) + # 保持其他代码不变 +''' +import os +from alembic import context +from sqlalchemy import create_engine +from logging.config import fileConfig +from models.shared import Base as BaseShared +from models.scrapy import Base as BaseScrapy + +# 合并元数据 +from sqlalchemy import MetaData +target_metadata = MetaData() +for t in BaseShared.metadata.tables.values(): + t.tometadata(target_metadata) +for t in BaseScrapy.metadata.tables.values(): + t.tometadata(target_metadata) + +home_dir = os.path.expanduser("~") +global_share_data_dir = f'{home_dir}/sharedata/sqlite' + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + """ + + url = f"sqlite:///{global_share_data_dir}/test.db" + connectable = create_engine(url) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/script.py.mako b/scrapy_proj/sqlalchemy/migrations/testdb/script.py.mako new file mode 100644 index 0000000..1101630 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. 
+revision: str = ${repr(up_revision)} +down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade schema.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade schema.""" + ${downgrades if downgrades else "pass"} diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py b/scrapy_proj/sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py new file mode 100644 index 0000000..50c46b6 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py @@ -0,0 +1,34 @@ +"""Initial test schema + +Revision ID: 854378c8e332 +Revises: bce23e0d0c3a +Create Date: 2025-07-29 19:58:01.503647 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '854378c8e332' +down_revision: Union[str, Sequence[str], None] = 'bce23e0d0c3a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('javdb_actors', sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=False)) + op.add_column('javdb_actors', sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('javdb_actors', 'uncensored') + op.drop_column('javdb_actors', 'movies_cnt') + # ### end Alembic commands ### diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py b/scrapy_proj/sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py new file mode 100644 index 0000000..ced70c8 --- /dev/null +++ b/scrapy_proj/sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py @@ -0,0 +1,674 @@ +"""Initial test schema + +Revision ID: 86eea10972c0 +Revises: +Create Date: 2025-07-29 19:25:41.776214 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '86eea10972c0' +down_revision: Union[str, Sequence[str], None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('clm_index', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('category', sa.Text(), nullable=True), + sa.Column('title', sa.Text(), nullable=True), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('magnet_href', sa.Text(), nullable=True), + sa.Column('size_text', sa.Text(), nullable=True), + sa.Column('size_gb', sa.REAL(), nullable=True), + sa.Column('heat', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('add_date', sa.Text(), nullable=True), + sa.Column('last_down_date', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('clm_keywords', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('words', sa.Text(), nullable=True), + sa.Column('groups', sa.Text(), nullable=True), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('index_count', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('words') + ) + op.create_table('iafd_distributors', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + 
op.create_table('iafd_meta_ethnic', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('iafd_performers', + sa.Column('name', sa.Text(), nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('birth_year', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_astro_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_birth_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_ethnic_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('gender', sa.Text(), nullable=True), + sa.Column('birthday', sa.Text(), nullable=True), + sa.Column('astrology', sa.Text(), nullable=True), + sa.Column('birthplace', sa.Text(), nullable=True), + sa.Column('years_active', sa.Text(), nullable=True), + sa.Column('ethnicity', sa.Text(), nullable=True), + sa.Column('nationality', sa.Text(), nullable=True), + sa.Column('hair_colors', sa.Text(), nullable=True), + sa.Column('eye_color', sa.Text(), nullable=True), + sa.Column('height_str', sa.Text(), nullable=True), + sa.Column('weight_str', sa.Text(), nullable=True), + sa.Column('measurements', sa.Text(), nullable=True), + sa.Column('tattoos', sa.Text(), nullable=True), + sa.Column('piercings', sa.Text(), nullable=True), + sa.Column('fake_tits', sa.Text(), nullable=True), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', 
sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('weight', sa.Integer(), nullable=True), + sa.Column('height', sa.Integer(), nullable=True), + sa.Column('rating', sa.Integer(), nullable=True), + sa.Column('movies_cnt', sa.Integer(), nullable=True), + sa.Column('vixen_cnt', sa.Integer(), nullable=True), + sa.Column('blacked_cnt', sa.Integer(), nullable=True), + sa.Column('tushy_cnt', sa.Integer(), nullable=True), + sa.Column('x_art_cnt', sa.Integer(), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('iafd_studios', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('iafd_task_log', + sa.Column('task_id', sa.Integer(), nullable=False), + sa.Column('full_data_performers', sa.Integer(), nullable=True), + sa.Column('total_performers', sa.Integer(), nullable=True), + sa.Column('full_data_movies', sa.Integer(), nullable=True), + sa.Column('total_movies', sa.Integer(), nullable=True), + sa.Column('total_distributors', sa.Integer(), nullable=True), + sa.Column('total_studios', sa.Integer(), nullable=True), + sa.Column('task_status', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('task_id') + ) + op.create_table('javbus_actors', + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), 
nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_actor_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('ja_name', sa.Text(), nullable=True), + sa.Column('zh_name', sa.Text(), nullable=True), + sa.Column('en_name', sa.Text(), nullable=True), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('pic', sa.Text(), nullable=True), + sa.Column('birth_date', sa.Text(), nullable=True), + sa.Column('height', sa.Text(), nullable=True), + sa.Column('breast_size', sa.Text(), nullable=True), + sa.Column('measurements', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('javbus_labels', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=True), + sa.Column('en_name', sa.String(length=255), nullable=True), + sa.Column('ja_name', sa.String(length=255), nullable=True), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 
'localtime'))"), nullable=True), + sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('magnet_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javbus_movies', + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_actor_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_studios', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_labels', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_series', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('actors_cnt', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('title', sa.Text(), nullable=True), + sa.Column('cover_url', sa.Text(), nullable=True), + sa.Column('serial_number', sa.Text(), nullable=True), + sa.Column('release_date', sa.Text(), nullable=True), + sa.Column('duration', sa.Text(), nullable=True), + sa.Column('studio_id', sa.Integer(), nullable=True), + sa.Column('label_id', sa.Integer(), nullable=True), + sa.Column('series_id', sa.Integer(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('javbus_series', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=True), + sa.Column('en_name', sa.String(length=255), nullable=True), + sa.Column('ja_name', sa.String(length=255), nullable=True), + 
sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('magnet_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javbus_studios', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=True), + sa.Column('en_name', sa.String(length=255), nullable=True), + sa.Column('ja_name', sa.String(length=255), nullable=True), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('magnet_cnt', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javbus_tags', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=True), 
+ sa.Column('en_name', sa.String(length=255), nullable=True), + sa.Column('ja_name', sa.String(length=255), nullable=True), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javdb_actors', + sa.Column('name', sa.Text(), nullable=False), + sa.Column('href', sa.Text(), nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_actor_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('pic', sa.Text(), nullable=True), + sa.Column('created_at', sa.DateTime(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.DateTime(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('javdb_makers', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javdb_movies', + sa.Column('is_full_data', sa.Integer(), 
server_default=sa.text('0'), nullable=False), + sa.Column('from_actor_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_makers', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_series', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_movie_publishers', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('uncensored', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('title', sa.Text(), nullable=True), + sa.Column('cover_url', sa.Text(), nullable=True), + sa.Column('serial_number', sa.Text(), nullable=True), + sa.Column('release_date', sa.Text(), nullable=True), + sa.Column('duration', sa.Text(), nullable=True), + sa.Column('maker_id', sa.Text(), nullable=True), + sa.Column('series_id', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('pub_id', sa.Integer(), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('javdb_publishers', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + 
op.create_table('javdb_series', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('details', sa.Text(), nullable=True), + sa.Column('from_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.Column('from_movie_list', sa.Integer(), server_default=sa.text('0'), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javdb_tags', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('javdb_task_log', + sa.Column('task_id', sa.Integer(), nullable=False), + sa.Column('full_data_actors', sa.Integer(), nullable=True), + sa.Column('total_actors', sa.Integer(), nullable=True), + sa.Column('full_data_movies', sa.Integer(), nullable=True), + sa.Column('total_movies', sa.Integer(), nullable=True), + sa.Column('total_makers', sa.Integer(), nullable=True), + sa.Column('total_series', sa.Integer(), nullable=True), + sa.Column('task_status', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('task_id') + ) + op.create_table('javhd_models', + sa.Column('is_full_data', sa.Integer(), 
server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('rank', sa.Integer(), nullable=True), + sa.Column('ja_name', sa.Text(), nullable=True), + sa.Column('zh_name', sa.Text(), nullable=True), + sa.Column('en_name', sa.Text(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('pic', sa.Text(), nullable=True), + sa.Column('height', sa.Text(), nullable=True), + sa.Column('weight', sa.Text(), nullable=True), + sa.Column('breast_size', sa.Text(), nullable=True), + sa.Column('breast_factor', sa.Text(), nullable=True), + sa.Column('hair_color', sa.Text(), nullable=True), + sa.Column('eye_color', sa.Text(), nullable=True), + sa.Column('birth_date', sa.Text(), nullable=True), + sa.Column('ethnicity', sa.Text(), nullable=True), + sa.Column('birth_place', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('url') + ) + op.create_table('pbox_actors', + sa.Column('name', sa.Text(), nullable=False), + sa.Column('movies_cnt', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('gender', sa.Text(), nullable=True), + sa.Column('age', sa.Integer(), nullable=True), + sa.Column('nationality', sa.Text(), nullable=True), + sa.Column('country', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + 
op.create_table('pbox_studios', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('label_id', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('scene_count', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('description', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('pbox_tags', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('tag_id', sa.Integer(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('sis', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('plate_name', sa.Text(), nullable=True), + sa.Column('title', sa.Text(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('size_text', sa.Text(), nullable=True), + sa.Column('size_gb', sa.REAL(), nullable=True), + sa.Column('update_date', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('url') + ) + op.create_table('thelordofporn_actress', + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + 
sa.Column('id', sa.Integer(), nullable=False), + sa.Column('pornstar', sa.Text(), nullable=True), + sa.Column('rating', sa.REAL(), nullable=True), + sa.Column('rank', sa.Integer(), nullable=True), + sa.Column('votes', sa.Integer(), nullable=True), + sa.Column('href', sa.Text(), nullable=True), + sa.Column('career_start', sa.Text(), nullable=True), + sa.Column('measurements', sa.Text(), nullable=True), + sa.Column('born', sa.Text(), nullable=True), + sa.Column('height', sa.Text(), nullable=True), + sa.Column('weight', sa.Text(), nullable=True), + sa.Column('date_modified', sa.Text(), nullable=True), + sa.Column('global_rank', sa.Integer(), nullable=True), + sa.Column('weekly_rank', sa.Integer(), nullable=True), + sa.Column('last_month_rating', sa.REAL(), nullable=True), + sa.Column('current_rating', sa.REAL(), nullable=True), + sa.Column('total_votes', sa.Integer(), nullable=True), + sa.Column('birth_date', sa.Text(), nullable=True), + sa.Column('birth_year', sa.Text(), nullable=True), + sa.Column('birth_place', sa.Text(), nullable=True), + sa.Column('height_ft', sa.Text(), nullable=True), + sa.Column('height_cm', sa.Text(), nullable=True), + sa.Column('weight_lbs', sa.Text(), nullable=True), + sa.Column('weight_kg', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('href') + ) + op.create_table('u3c3', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('category', sa.Text(), nullable=True), + sa.Column('title', sa.Text(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('torrent_url', sa.Text(), nullable=True), + sa.Column('magnet_url', sa.Text(), nullable=True), + sa.Column('size_text', sa.Text(), nullable=True), + sa.Column('size_gb', sa.REAL(), nullable=True), + 
sa.Column('update_date', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('url') + ) + op.create_table('clm_keywords_index', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('words_id', sa.Integer(), nullable=True), + sa.Column('index_id', sa.Integer(), nullable=True), + sa.Column('wid_iid', sa.String(length=255), nullable=True), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['index_id'], ['clm_index.id'], ), + sa.ForeignKeyConstraint(['words_id'], ['clm_keywords.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('iafd_movies', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('release_year', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_performer_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_dist_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('from_stu_list', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('title', sa.String(length=255), nullable=True), + sa.Column('minutes', sa.String(length=255), nullable=True), + sa.Column('distributor_id', sa.Integer(), nullable=True), + sa.Column('studio_id', sa.Integer(), nullable=True), + sa.Column('release_date', sa.String(length=255), nullable=True), + sa.Column('added_to_IAFD_date', sa.String(length=255), nullable=True), + sa.Column('all_girl', sa.String(length=255), 
nullable=True), + sa.Column('all_male', sa.String(length=255), nullable=True), + sa.Column('compilation', sa.String(length=255), nullable=True), + sa.Column('webscene', sa.String(length=255), nullable=True), + sa.Column('director_id', sa.Integer(), nullable=True), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['distributor_id'], ['iafd_distributors.id'], ), + sa.ForeignKeyConstraint(['studio_id'], ['iafd_studios.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('iafd_performer_aliases', + sa.Column('performer_id', sa.Integer(), nullable=False), + sa.Column('alias', sa.String(length=255), nullable=False), + sa.ForeignKeyConstraint(['performer_id'], ['iafd_performers.id'], ), + sa.PrimaryKeyConstraint('performer_id', 'alias') + ) + op.create_table('iafd_performer_urls', + sa.Column('performer_id', sa.Integer(), nullable=False), + sa.Column('position', sa.String(length=255), nullable=False), + sa.Column('url', sa.String(length=255), nullable=False), + sa.ForeignKeyConstraint(['performer_id'], ['iafd_performers.id'], ), + sa.PrimaryKeyConstraint('performer_id', 'position', 'url') + ) + op.create_table('javbus_actors_movies', + sa.Column('actor_id', sa.Integer(), nullable=False), + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['actor_id'], ['javbus_actors.id'], ), + sa.ForeignKeyConstraint(['movie_id'], ['javbus_movies.id'], ), + sa.PrimaryKeyConstraint('actor_id', 'movie_id') + ) + 
op.create_index('idx_actor_movie_actor_id', 'javbus_actors_movies', ['actor_id'], unique=False) + op.create_table('javbus_movies_tags', + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('tag_id', sa.Integer(), nullable=False), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['movie_id'], ['javbus_movies.id'], ), + sa.ForeignKeyConstraint(['tag_id'], ['javbus_tags.id'], ), + sa.PrimaryKeyConstraint('movie_id', 'tag_id') + ) + op.create_table('javdb_actors_alias', + sa.Column('actor_id', sa.Integer(), nullable=False), + sa.Column('alias', sa.Text(), nullable=False), + sa.Column('created_at', sa.DateTime(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.DateTime(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['actor_id'], ['javdb_actors.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('actor_id', 'alias') + ) + op.create_table('javdb_actors_movies', + sa.Column('actor_id', sa.Integer(), nullable=False), + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), nullable=True), + sa.ForeignKeyConstraint(['actor_id'], ['javdb_actors.id'], ), + sa.ForeignKeyConstraint(['movie_id'], ['javdb_movies.id'], ), + sa.PrimaryKeyConstraint('actor_id', 'movie_id') + ) + op.create_table('javdb_movies_tags', + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('tag_id', sa.Integer(), nullable=False), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), 
server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['movie_id'], ['javdb_movies.id'], ), + sa.ForeignKeyConstraint(['tag_id'], ['javdb_tags.id'], ), + sa.PrimaryKeyConstraint('movie_id', 'tag_id') + ) + op.create_table('pbox_actor_aliases', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('actor_id', sa.Integer(), nullable=False), + sa.Column('alias', sa.String(length=255), nullable=False), + sa.Column('actor_alias', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['actor_id'], ['pbox_actors.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('pbox_movies', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('is_full_data', sa.Integer(), server_default=sa.text('0'), nullable=False), + sa.Column('href', sa.String(length=255), nullable=True), + sa.Column('title', sa.String(length=255), nullable=True), + sa.Column('movie_id', sa.Integer(), nullable=True), + sa.Column('content_id', sa.Integer(), nullable=True), + sa.Column('duration', sa.String(length=255), nullable=True), + sa.Column('publish_date', sa.String(length=255), nullable=True), + sa.Column('release_date', sa.String(length=255), nullable=True), + sa.Column('studio_id', sa.Integer(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['studio_id'], ['pbox_studios.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('thelordofporn_alias', + sa.Column('actress_id', 
sa.Integer(), nullable=False), + sa.Column('alias', sa.Text(), nullable=False), + sa.Column('updated_at', sa.Text(), nullable=True), + sa.ForeignKeyConstraint(['actress_id'], ['thelordofporn_actress.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('actress_id', 'alias') + ) + op.create_table('iafd_movies_appers_in', + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('appears_in_id', sa.Integer(), nullable=False), + sa.Column('gradation', sa.Integer(), nullable=True), + sa.Column('notes', sa.String(length=255), nullable=True), + sa.ForeignKeyConstraint(['appears_in_id'], ['iafd_movies.id'], ), + sa.ForeignKeyConstraint(['movie_id'], ['iafd_movies.id'], ), + sa.PrimaryKeyConstraint('movie_id', 'appears_in_id') + ) + op.create_table('iafd_performers_movies', + sa.Column('performer_id', sa.Integer(), nullable=False), + sa.Column('movie_id', sa.Integer(), nullable=False), + sa.Column('role', sa.String(length=255), nullable=True), + sa.Column('notes', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['movie_id'], ['iafd_movies.id'], ), + sa.ForeignKeyConstraint(['performer_id'], ['iafd_performers.id'], ), + sa.PrimaryKeyConstraint('performer_id', 'movie_id') + ) + op.create_index('idx_iafd_performers_movies_performer_id', 'iafd_performers_movies', ['performer_id'], unique=False) + op.create_table('pbox_actors_movies', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('actor_id', sa.Integer(), nullable=True), + sa.Column('movie_id', sa.Integer(), nullable=True), + sa.Column('actor_mov', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('tags', sa.Text(), nullable=True), + 
sa.ForeignKeyConstraint(['actor_id'], ['pbox_actors.id'], ), + sa.ForeignKeyConstraint(['movie_id'], ['pbox_movies.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('pbox_movies_alts', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('min_mov_id', sa.Integer(), nullable=True), + sa.Column('max_mov_id', sa.Integer(), nullable=True), + sa.Column('min_max', sa.String(length=255), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['max_mov_id'], ['pbox_movies.id'], ), + sa.ForeignKeyConstraint(['min_mov_id'], ['pbox_movies.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('pbox_movies_tags', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('movie_id', sa.Integer(), nullable=True), + sa.Column('tag_id', sa.Integer(), nullable=True), + sa.Column('movid_tagid', sa.String(length=255), nullable=True), + sa.Column('tags', sa.Text(), nullable=True), + sa.Column('created_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.Column('updated_at', sa.Text(), server_default=sa.text("(datetime('now', 'localtime'))"), nullable=True), + sa.ForeignKeyConstraint(['movie_id'], ['pbox_movies.id'], ), + sa.ForeignKeyConstraint(['tag_id'], ['pbox_tags.id'], ), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('pbox_movies_tags') + op.drop_table('pbox_movies_alts') + op.drop_table('pbox_actors_movies') + op.drop_index('idx_iafd_performers_movies_performer_id', table_name='iafd_performers_movies') + op.drop_table('iafd_performers_movies') + op.drop_table('iafd_movies_appers_in') + op.drop_table('thelordofporn_alias') + op.drop_table('pbox_movies') + op.drop_table('pbox_actor_aliases') + op.drop_table('javdb_movies_tags') + op.drop_table('javdb_actors_movies') + op.drop_table('javdb_actors_alias') + op.drop_table('javbus_movies_tags') + op.drop_index('idx_actor_movie_actor_id', table_name='javbus_actors_movies') + op.drop_table('javbus_actors_movies') + op.drop_table('iafd_performer_urls') + op.drop_table('iafd_performer_aliases') + op.drop_table('iafd_movies') + op.drop_table('clm_keywords_index') + op.drop_table('u3c3') + op.drop_table('thelordofporn_actress') + op.drop_table('sis') + op.drop_table('pbox_tags') + op.drop_table('pbox_studios') + op.drop_table('pbox_actors') + op.drop_table('javhd_models') + op.drop_table('javdb_task_log') + op.drop_table('javdb_tags') + op.drop_table('javdb_series') + op.drop_table('javdb_publishers') + op.drop_table('javdb_movies') + op.drop_table('javdb_makers') + op.drop_table('javdb_actors') + op.drop_table('javbus_tags') + op.drop_table('javbus_studios') + op.drop_table('javbus_series') + op.drop_table('javbus_movies') + op.drop_table('javbus_labels') + op.drop_table('javbus_actors') + op.drop_table('iafd_task_log') + op.drop_table('iafd_studios') + op.drop_table('iafd_performers') + op.drop_table('iafd_meta_ethnic') + op.drop_table('iafd_distributors') + op.drop_table('clm_keywords') + op.drop_table('clm_index') + # ### end Alembic commands ### diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py b/scrapy_proj/sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py new file mode 100644 index 0000000..7bcc928 --- /dev/null +++ 
b/scrapy_proj/sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py @@ -0,0 +1,32 @@ +"""Initial test schema + +Revision ID: bce23e0d0c3a +Revises: 86eea10972c0 +Create Date: 2025-07-29 19:54:54.268814 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'bce23e0d0c3a' +down_revision: Union[str, Sequence[str], None] = '86eea10972c0' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### diff --git a/scrapy_proj/sqlalchemy/models/scrapy.py b/scrapy_proj/sqlalchemy/models/scrapy.py new file mode 100644 index 0000000..c0e8f3f --- /dev/null +++ b/scrapy_proj/sqlalchemy/models/scrapy.py @@ -0,0 +1,85 @@ +from sqlalchemy import ForeignKey, Integer, REAL, String, Text, text +from typing import List, Optional + +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship + +class Base(DeclarativeBase): + pass + + +class ClmIndex(Base): + __tablename__ = 'clm_index' + + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + category: Mapped[Optional[str]] = mapped_column(Text) + title: Mapped[Optional[str]] = mapped_column(Text) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + magnet_href: Mapped[Optional[str]] = mapped_column(Text) + size_text: Mapped[Optional[str]] = mapped_column(Text) + size_gb: Mapped[Optional[float]] = mapped_column(REAL) + heat: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + add_date: Mapped[Optional[str]] = mapped_column(Text) + last_down_date: Mapped[Optional[str]] = 
mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + + clm_keywords_index: Mapped[List['ClmKeywordsIndex']] = relationship('ClmKeywordsIndex', back_populates='index') + + +class ClmKeywords(Base): + __tablename__ = 'clm_keywords' + + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + words: Mapped[Optional[str]] = mapped_column(Text, unique=True) + groups: Mapped[Optional[str]] = mapped_column(Text) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + index_count: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + clm_keywords_index: Mapped[List['ClmKeywordsIndex']] = relationship('ClmKeywordsIndex', back_populates='words') + + +class Sis(Base): + __tablename__ = 'sis' + + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + plate_name: Mapped[Optional[str]] = mapped_column(Text) + title: Mapped[Optional[str]] = mapped_column(Text) + url: Mapped[Optional[str]] = mapped_column(Text, unique=True) + size_text: Mapped[Optional[str]] = mapped_column(Text) + size_gb: Mapped[Optional[float]] = mapped_column(REAL) + update_date: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + + +class U3c3(Base): + __tablename__ = 'u3c3' + + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + category: Mapped[Optional[str]] = mapped_column(Text) + title: Mapped[Optional[str]] = 
mapped_column(Text) + url: Mapped[Optional[str]] = mapped_column(Text, unique=True) + torrent_url: Mapped[Optional[str]] = mapped_column(Text) + magnet_url: Mapped[Optional[str]] = mapped_column(Text) + size_text: Mapped[Optional[str]] = mapped_column(Text) + size_gb: Mapped[Optional[float]] = mapped_column(REAL) + update_date: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + + +class ClmKeywordsIndex(Base): + __tablename__ = 'clm_keywords_index' + + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + words_id: Mapped[Optional[int]] = mapped_column(ForeignKey('clm_keywords.id')) + index_id: Mapped[Optional[int]] = mapped_column(ForeignKey('clm_index.id')) + wid_iid: Mapped[Optional[str]] = mapped_column(String(255)) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("(datetime('now', 'localtime'))")) + + index: Mapped[Optional['ClmIndex']] = relationship('ClmIndex', back_populates='clm_keywords_index') + words: Mapped[Optional['ClmKeywords']] = relationship('ClmKeywords', back_populates='clm_keywords_index') diff --git a/scrapy_proj/sqlalchemy/models/shared.py b/scrapy_proj/sqlalchemy/models/shared.py new file mode 100644 index 0000000..6496ea1 --- /dev/null +++ b/scrapy_proj/sqlalchemy/models/shared.py @@ -0,0 +1,670 @@ +from sqlalchemy import DateTime, ForeignKey, Index, Integer, REAL, String, Text, text +from typing import List, Optional + +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship +import datetime + +class Base(DeclarativeBase): + pass + + +class IafdDistributors(Base): + __tablename__ = 
'iafd_distributors' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + details: Mapped[Optional[str]] = mapped_column(Text) + + iafd_movies: Mapped[List['IafdMovies']] = relationship('IafdMovies', back_populates='distributor') + + +class IafdMetaEthnic(Base): + __tablename__ = 'iafd_meta_ethnic' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + +class IafdPerformers(Base): + __tablename__ = 'iafd_performers' + + name: Mapped[str] = mapped_column(Text) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + birth_year: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_astro_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_birth_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_ethnic_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + gender: Mapped[Optional[str]] = mapped_column(Text) + birthday: Mapped[Optional[str]] = mapped_column(Text) + astrology: Mapped[Optional[str]] = mapped_column(Text) + birthplace: Mapped[Optional[str]] = mapped_column(Text) + years_active: Mapped[Optional[str]] = mapped_column(Text) + ethnicity: Mapped[Optional[str]] = mapped_column(Text) + nationality: Mapped[Optional[str]] = mapped_column(Text) + 
hair_colors: Mapped[Optional[str]] = mapped_column(Text) + eye_color: Mapped[Optional[str]] = mapped_column(Text) + height_str: Mapped[Optional[str]] = mapped_column(Text) + weight_str: Mapped[Optional[str]] = mapped_column(Text) + measurements: Mapped[Optional[str]] = mapped_column(Text) + tattoos: Mapped[Optional[str]] = mapped_column(Text) + piercings: Mapped[Optional[str]] = mapped_column(Text) + fake_tits: Mapped[Optional[str]] = mapped_column(Text) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + weight: Mapped[Optional[int]] = mapped_column(Integer) + height: Mapped[Optional[int]] = mapped_column(Integer) + rating: Mapped[Optional[int]] = mapped_column(Integer) + movies_cnt: Mapped[Optional[int]] = mapped_column(Integer) + vixen_cnt: Mapped[Optional[int]] = mapped_column(Integer) + blacked_cnt: Mapped[Optional[int]] = mapped_column(Integer) + tushy_cnt: Mapped[Optional[int]] = mapped_column(Integer) + x_art_cnt: Mapped[Optional[int]] = mapped_column(Integer) + + iafd_performer_aliases: Mapped[List['IafdPerformerAliases']] = relationship('IafdPerformerAliases', back_populates='performer') + iafd_performer_urls: Mapped[List['IafdPerformerUrls']] = relationship('IafdPerformerUrls', back_populates='performer') + iafd_performers_movies: Mapped[List['IafdPerformersMovies']] = relationship('IafdPerformersMovies', back_populates='performer') + + +class IafdStudios(Base): + __tablename__ = 'iafd_studios' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = 
mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + details: Mapped[Optional[str]] = mapped_column(Text) + + iafd_movies: Mapped[List['IafdMovies']] = relationship('IafdMovies', back_populates='studio') + + +class IafdTaskLog(Base): + __tablename__ = 'iafd_task_log' + + task_id: Mapped[int] = mapped_column(Integer, primary_key=True) + full_data_performers: Mapped[Optional[int]] = mapped_column(Integer) + total_performers: Mapped[Optional[int]] = mapped_column(Integer) + full_data_movies: Mapped[Optional[int]] = mapped_column(Integer) + total_movies: Mapped[Optional[int]] = mapped_column(Integer) + total_distributors: Mapped[Optional[int]] = mapped_column(Integer) + total_studios: Mapped[Optional[int]] = mapped_column(Integer) + task_status: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + +class JavbusActors(Base): + __tablename__ = 'javbus_actors' + + uncensored: Mapped[int] = mapped_column(Integer, server_default=text('0')) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_actor_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + movies_cnt: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + ja_name: Mapped[Optional[str]] = mapped_column(Text) + zh_name: Mapped[Optional[str]] = mapped_column(Text) + en_name: Mapped[Optional[str]] = mapped_column(Text) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + pic: Mapped[Optional[str]] = mapped_column(Text) + birth_date: Mapped[Optional[str]] = mapped_column(Text) + height: Mapped[Optional[str]] = mapped_column(Text) + breast_size: 
Mapped[Optional[str]] = mapped_column(Text) + measurements: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + javbus_actors_movies: Mapped[List['JavbusActorsMovies']] = relationship('JavbusActorsMovies', back_populates='actor') + + +class JavbusLabels(Base): + __tablename__ = 'javbus_labels' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[Optional[str]] = mapped_column(String(255)) + en_name: Mapped[Optional[str]] = mapped_column(String(255)) + ja_name: Mapped[Optional[str]] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + details: Mapped[Optional[str]] = mapped_column(Text) + uncensored: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + movies_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + magnet_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + +class JavbusMovies(Base): + __tablename__ = 'javbus_movies' + + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + uncensored: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_actor_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_studios: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_labels: Mapped[int] = mapped_column(Integer, 
server_default=text('0')) + from_movie_series: Mapped[int] = mapped_column(Integer, server_default=text('0')) + actors_cnt: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + title: Mapped[Optional[str]] = mapped_column(Text) + cover_url: Mapped[Optional[str]] = mapped_column(Text) + serial_number: Mapped[Optional[str]] = mapped_column(Text) + release_date: Mapped[Optional[str]] = mapped_column(Text) + duration: Mapped[Optional[str]] = mapped_column(Text) + studio_id: Mapped[Optional[int]] = mapped_column(Integer) + label_id: Mapped[Optional[int]] = mapped_column(Integer) + series_id: Mapped[Optional[int]] = mapped_column(Integer) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + javbus_actors_movies: Mapped[List['JavbusActorsMovies']] = relationship('JavbusActorsMovies', back_populates='movie') + javbus_movies_tags: Mapped[List['JavbusMoviesTags']] = relationship('JavbusMoviesTags', back_populates='movie') + + +class JavbusSeries(Base): + __tablename__ = 'javbus_series' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[Optional[str]] = mapped_column(String(255)) + en_name: Mapped[Optional[str]] = mapped_column(String(255)) + ja_name: Mapped[Optional[str]] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + details: Mapped[Optional[str]] = mapped_column(Text) + uncensored: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + created_at: Mapped[Optional[str]] = mapped_column(Text, 
server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + movies_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + magnet_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + +class JavbusStudios(Base): + __tablename__ = 'javbus_studios' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[Optional[str]] = mapped_column(String(255)) + en_name: Mapped[Optional[str]] = mapped_column(String(255)) + ja_name: Mapped[Optional[str]] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + details: Mapped[Optional[str]] = mapped_column(Text) + uncensored: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + movies_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + magnet_cnt: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + +class JavbusTags(Base): + __tablename__ = 'javbus_tags' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[Optional[str]] = mapped_column(String(255)) + en_name: Mapped[Optional[str]] = mapped_column(String(255)) + ja_name: Mapped[Optional[str]] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 
'localtime')")) + + javbus_movies_tags: Mapped[List['JavbusMoviesTags']] = relationship('JavbusMoviesTags', back_populates='tag') + + +class JavdbActors(Base): + __tablename__ = 'javdb_actors' + + name: Mapped[str] = mapped_column(Text) + href: Mapped[str] = mapped_column(Text, unique=True) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_actor_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + movies_cnt: Mapped[int] = mapped_column(Integer, server_default=text('0')) + uncensored: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + pic: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[datetime.datetime]] = mapped_column(DateTime, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[datetime.datetime]] = mapped_column(DateTime, server_default=text("datetime('now', 'localtime')")) + + javdb_actors_alias: Mapped[List['JavdbActorsAlias']] = relationship('JavdbActorsAlias', back_populates='actor') + javdb_actors_movies: Mapped[List['JavdbActorsMovies']] = relationship('JavdbActorsMovies', back_populates='actor') + + +class JavdbMakers(Base): + __tablename__ = 'javdb_makers' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + details: Mapped[Optional[str]] = mapped_column(Text) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + 
+class JavdbMovies(Base): + __tablename__ = 'javdb_movies' + + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_actor_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_makers: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_series: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_movie_publishers: Mapped[int] = mapped_column(Integer, server_default=text('0')) + uncensored: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + title: Mapped[Optional[str]] = mapped_column(Text) + cover_url: Mapped[Optional[str]] = mapped_column(Text) + serial_number: Mapped[Optional[str]] = mapped_column(Text) + release_date: Mapped[Optional[str]] = mapped_column(Text) + duration: Mapped[Optional[str]] = mapped_column(Text) + maker_id: Mapped[Optional[str]] = mapped_column(Text) + series_id: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + pub_id: Mapped[Optional[int]] = mapped_column(Integer) + + javdb_actors_movies: Mapped[List['JavdbActorsMovies']] = relationship('JavdbActorsMovies', back_populates='movie') + javdb_movies_tags: Mapped[List['JavdbMoviesTags']] = relationship('JavdbMoviesTags', back_populates='movie') + + +class JavdbPublishers(Base): + __tablename__ = 'javdb_publishers' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = 
mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + details: Mapped[Optional[str]] = mapped_column(Text) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + +class JavdbSeries(Base): + __tablename__ = 'javdb_series' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + details: Mapped[Optional[str]] = mapped_column(Text) + from_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + from_movie_list: Mapped[Optional[int]] = mapped_column(Integer, server_default=text('0')) + + +class JavdbTags(Base): + __tablename__ = 'javdb_tags' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + javdb_movies_tags: Mapped[List['JavdbMoviesTags']] = relationship('JavdbMoviesTags', back_populates='tag') + + +class JavdbTaskLog(Base): + __tablename__ = 'javdb_task_log' + + task_id: Mapped[int] = mapped_column(Integer, primary_key=True) + full_data_actors: Mapped[Optional[int]] = mapped_column(Integer) + total_actors: Mapped[Optional[int]] = mapped_column(Integer) + full_data_movies: Mapped[Optional[int]] = mapped_column(Integer) + total_movies: Mapped[Optional[int]] = mapped_column(Integer) + total_makers: Mapped[Optional[int]] = 
mapped_column(Integer) + total_series: Mapped[Optional[int]] = mapped_column(Integer) + task_status: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + +class JavhdModels(Base): + __tablename__ = 'javhd_models' + + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + rank: Mapped[Optional[int]] = mapped_column(Integer) + ja_name: Mapped[Optional[str]] = mapped_column(Text) + zh_name: Mapped[Optional[str]] = mapped_column(Text) + en_name: Mapped[Optional[str]] = mapped_column(Text) + url: Mapped[Optional[str]] = mapped_column(Text, unique=True) + pic: Mapped[Optional[str]] = mapped_column(Text) + height: Mapped[Optional[str]] = mapped_column(Text) + weight: Mapped[Optional[str]] = mapped_column(Text) + breast_size: Mapped[Optional[str]] = mapped_column(Text) + breast_factor: Mapped[Optional[str]] = mapped_column(Text) + hair_color: Mapped[Optional[str]] = mapped_column(Text) + eye_color: Mapped[Optional[str]] = mapped_column(Text) + birth_date: Mapped[Optional[str]] = mapped_column(Text) + ethnicity: Mapped[Optional[str]] = mapped_column(Text) + birth_place: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + +class PboxActors(Base): + __tablename__ = 'pbox_actors' + + name: Mapped[str] = mapped_column(Text) + movies_cnt: Mapped[int] = mapped_column(Integer, server_default=text('0')) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + 
href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + gender: Mapped[Optional[str]] = mapped_column(Text) + age: Mapped[Optional[int]] = mapped_column(Integer) + nationality: Mapped[Optional[str]] = mapped_column(Text) + country: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + pbox_actor_aliases: Mapped[List['PboxActorAliases']] = relationship('PboxActorAliases', back_populates='actor') + pbox_actors_movies: Mapped[List['PboxActorsMovies']] = relationship('PboxActorsMovies', back_populates='actor') + + +class PboxStudios(Base): + __tablename__ = 'pbox_studios' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + label_id: Mapped[int] = mapped_column(Integer, server_default=text('0')) + scene_count: Mapped[int] = mapped_column(Integer, server_default=text('0')) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + description: Mapped[Optional[str]] = mapped_column(Text) + + pbox_movies: Mapped[List['PboxMovies']] = relationship('PboxMovies', back_populates='studio') + + +class PboxTags(Base): + __tablename__ = 'pbox_tags' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + href: Mapped[Optional[str]] = mapped_column(String(255)) + tag_id: Mapped[Optional[int]] = mapped_column(Integer) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 
'localtime')")) + + pbox_movies_tags: Mapped[List['PboxMoviesTags']] = relationship('PboxMoviesTags', back_populates='tag') + + +class ThelordofpornActress(Base): + __tablename__ = 'thelordofporn_actress' + + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + id: Mapped[Optional[int]] = mapped_column(Integer, primary_key=True) + pornstar: Mapped[Optional[str]] = mapped_column(Text) + rating: Mapped[Optional[float]] = mapped_column(REAL) + rank: Mapped[Optional[int]] = mapped_column(Integer) + votes: Mapped[Optional[int]] = mapped_column(Integer) + href: Mapped[Optional[str]] = mapped_column(Text, unique=True) + career_start: Mapped[Optional[str]] = mapped_column(Text) + measurements: Mapped[Optional[str]] = mapped_column(Text) + born: Mapped[Optional[str]] = mapped_column(Text) + height: Mapped[Optional[str]] = mapped_column(Text) + weight: Mapped[Optional[str]] = mapped_column(Text) + date_modified: Mapped[Optional[str]] = mapped_column(Text) + global_rank: Mapped[Optional[int]] = mapped_column(Integer) + weekly_rank: Mapped[Optional[int]] = mapped_column(Integer) + last_month_rating: Mapped[Optional[float]] = mapped_column(REAL) + current_rating: Mapped[Optional[float]] = mapped_column(REAL) + total_votes: Mapped[Optional[int]] = mapped_column(Integer) + birth_date: Mapped[Optional[str]] = mapped_column(Text) + birth_year: Mapped[Optional[str]] = mapped_column(Text) + birth_place: Mapped[Optional[str]] = mapped_column(Text) + height_ft: Mapped[Optional[str]] = mapped_column(Text) + height_cm: Mapped[Optional[str]] = mapped_column(Text) + weight_lbs: Mapped[Optional[str]] = mapped_column(Text) + weight_kg: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + thelordofporn_alias: Mapped[List['ThelordofpornAlias']] = 
relationship('ThelordofpornAlias', back_populates='actress') + + +class IafdMovies(Base): + __tablename__ = 'iafd_movies' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + release_year: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_performer_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_dist_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + from_stu_list: Mapped[int] = mapped_column(Integer, server_default=text('0')) + title: Mapped[Optional[str]] = mapped_column(String(255)) + minutes: Mapped[Optional[str]] = mapped_column(String(255)) + distributor_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_distributors.id')) + studio_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_studios.id')) + release_date: Mapped[Optional[str]] = mapped_column(String(255)) + added_to_IAFD_date: Mapped[Optional[str]] = mapped_column(String(255)) + all_girl: Mapped[Optional[str]] = mapped_column(String(255)) + all_male: Mapped[Optional[str]] = mapped_column(String(255)) + compilation: Mapped[Optional[str]] = mapped_column(String(255)) + webscene: Mapped[Optional[str]] = mapped_column(String(255)) + director_id: Mapped[Optional[int]] = mapped_column(Integer) + href: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + distributor: Mapped[Optional['IafdDistributors']] = relationship('IafdDistributors', back_populates='iafd_movies') + studio: Mapped[Optional['IafdStudios']] = relationship('IafdStudios', back_populates='iafd_movies') + iafd_movies_appers_in: Mapped[List['IafdMoviesAppersIn']] = relationship('IafdMoviesAppersIn', foreign_keys='[IafdMoviesAppersIn.appears_in_id]', 
back_populates='appears_in') + iafd_movies_appers_in_: Mapped[List['IafdMoviesAppersIn']] = relationship('IafdMoviesAppersIn', foreign_keys='[IafdMoviesAppersIn.movie_id]', back_populates='movie') + iafd_performers_movies: Mapped[List['IafdPerformersMovies']] = relationship('IafdPerformersMovies', back_populates='movie') + + +class IafdPerformerAliases(Base): + __tablename__ = 'iafd_performer_aliases' + + performer_id: Mapped[int] = mapped_column(ForeignKey('iafd_performers.id'), primary_key=True) + alias: Mapped[str] = mapped_column(String(255), primary_key=True) + + performer: Mapped['IafdPerformers'] = relationship('IafdPerformers', back_populates='iafd_performer_aliases') + + +class IafdPerformerUrls(Base): + __tablename__ = 'iafd_performer_urls' + + performer_id: Mapped[int] = mapped_column(ForeignKey('iafd_performers.id'), primary_key=True) + position: Mapped[str] = mapped_column(String(255), primary_key=True) + url: Mapped[str] = mapped_column(String(255), primary_key=True) + + performer: Mapped['IafdPerformers'] = relationship('IafdPerformers', back_populates='iafd_performer_urls') + + +class JavbusActorsMovies(Base): + __tablename__ = 'javbus_actors_movies' + __table_args__ = ( + Index('idx_actor_movie_actor_id', 'actor_id'), + ) + + actor_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javbus_actors.id'), primary_key=True) + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javbus_movies.id'), primary_key=True) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + actor: Mapped[Optional['JavbusActors']] = relationship('JavbusActors', back_populates='javbus_actors_movies') + movie: Mapped[Optional['JavbusMovies']] = relationship('JavbusMovies', back_populates='javbus_actors_movies') + + +class JavbusMoviesTags(Base): + 
__tablename__ = 'javbus_movies_tags' + + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javbus_movies.id'), primary_key=True) + tag_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javbus_tags.id'), primary_key=True) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + movie: Mapped[Optional['JavbusMovies']] = relationship('JavbusMovies', back_populates='javbus_movies_tags') + tag: Mapped[Optional['JavbusTags']] = relationship('JavbusTags', back_populates='javbus_movies_tags') + + +class JavdbActorsAlias(Base): + __tablename__ = 'javdb_actors_alias' + + actor_id: Mapped[int] = mapped_column(ForeignKey('javdb_actors.id', ondelete='CASCADE'), primary_key=True) + alias: Mapped[str] = mapped_column(Text, primary_key=True) + created_at: Mapped[Optional[datetime.datetime]] = mapped_column(DateTime, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[datetime.datetime]] = mapped_column(DateTime, server_default=text("datetime('now', 'localtime')")) + + actor: Mapped['JavdbActors'] = relationship('JavdbActors', back_populates='javdb_actors_alias') + + +class JavdbActorsMovies(Base): + __tablename__ = 'javdb_actors_movies' + + actor_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javdb_actors.id'), primary_key=True) + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javdb_movies.id'), primary_key=True) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text) + + actor: Mapped[Optional['JavdbActors']] = relationship('JavdbActors', back_populates='javdb_actors_movies') + movie: Mapped[Optional['JavdbMovies']] = 
relationship('JavdbMovies', back_populates='javdb_actors_movies') + + +class JavdbMoviesTags(Base): + __tablename__ = 'javdb_movies_tags' + + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javdb_movies.id'), primary_key=True) + tag_id: Mapped[Optional[int]] = mapped_column(ForeignKey('javdb_tags.id'), primary_key=True) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + movie: Mapped[Optional['JavdbMovies']] = relationship('JavdbMovies', back_populates='javdb_movies_tags') + tag: Mapped[Optional['JavdbTags']] = relationship('JavdbTags', back_populates='javdb_movies_tags') + + +class PboxActorAliases(Base): + __tablename__ = 'pbox_actor_aliases' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + actor_id: Mapped[int] = mapped_column(ForeignKey('pbox_actors.id')) + alias: Mapped[str] = mapped_column(String(255)) + actor_alias: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + actor: Mapped['PboxActors'] = relationship('PboxActors', back_populates='pbox_actor_aliases') + + +class PboxMovies(Base): + __tablename__ = 'pbox_movies' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + is_full_data: Mapped[int] = mapped_column(Integer, server_default=text('0')) + href: Mapped[Optional[str]] = mapped_column(String(255)) + title: Mapped[Optional[str]] = mapped_column(String(255)) + movie_id: Mapped[Optional[int]] = mapped_column(Integer) + content_id: Mapped[Optional[int]] = mapped_column(Integer) + duration: Mapped[Optional[str]] = mapped_column(String(255)) + publish_date: 
Mapped[Optional[str]] = mapped_column(String(255)) + release_date: Mapped[Optional[str]] = mapped_column(String(255)) + studio_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_studios.id')) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + studio: Mapped[Optional['PboxStudios']] = relationship('PboxStudios', back_populates='pbox_movies') + pbox_actors_movies: Mapped[List['PboxActorsMovies']] = relationship('PboxActorsMovies', back_populates='movie') + pbox_movies_alts: Mapped[List['PboxMoviesAlts']] = relationship('PboxMoviesAlts', foreign_keys='[PboxMoviesAlts.max_mov_id]', back_populates='max_mov') + pbox_movies_alts_: Mapped[List['PboxMoviesAlts']] = relationship('PboxMoviesAlts', foreign_keys='[PboxMoviesAlts.min_mov_id]', back_populates='min_mov') + pbox_movies_tags: Mapped[List['PboxMoviesTags']] = relationship('PboxMoviesTags', back_populates='movie') + + +class ThelordofpornAlias(Base): + __tablename__ = 'thelordofporn_alias' + + actress_id: Mapped[int] = mapped_column(ForeignKey('thelordofporn_actress.id', ondelete='CASCADE'), primary_key=True) + alias: Mapped[str] = mapped_column(Text, primary_key=True) + updated_at: Mapped[Optional[str]] = mapped_column(Text) + + actress: Mapped['ThelordofpornActress'] = relationship('ThelordofpornActress', back_populates='thelordofporn_alias') + + +class IafdMoviesAppersIn(Base): + __tablename__ = 'iafd_movies_appers_in' + + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_movies.id'), primary_key=True) + appears_in_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_movies.id'), primary_key=True) + gradation: Mapped[Optional[int]] = mapped_column(Integer) + notes: Mapped[Optional[str]] = mapped_column(String(255)) + + appears_in: Mapped[Optional['IafdMovies']] = relationship('IafdMovies', 
foreign_keys=[appears_in_id], back_populates='iafd_movies_appers_in') + movie: Mapped[Optional['IafdMovies']] = relationship('IafdMovies', foreign_keys=[movie_id], back_populates='iafd_movies_appers_in_') + + +class IafdPerformersMovies(Base): + __tablename__ = 'iafd_performers_movies' + __table_args__ = ( + Index('idx_iafd_performers_movies_performer_id', 'performer_id'), + ) + + performer_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_performers.id'), primary_key=True) + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('iafd_movies.id'), primary_key=True) + role: Mapped[Optional[str]] = mapped_column(String(255)) + notes: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + movie: Mapped[Optional['IafdMovies']] = relationship('IafdMovies', back_populates='iafd_performers_movies') + performer: Mapped[Optional['IafdPerformers']] = relationship('IafdPerformers', back_populates='iafd_performers_movies') + + +class PboxActorsMovies(Base): + __tablename__ = 'pbox_actors_movies' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + actor_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_actors.id')) + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_movies.id')) + actor_mov: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + tags: Mapped[Optional[str]] = mapped_column(Text) + + actor: Mapped[Optional['PboxActors']] = relationship('PboxActors', back_populates='pbox_actors_movies') + movie: Mapped[Optional['PboxMovies']] = relationship('PboxMovies', back_populates='pbox_actors_movies') + + +class PboxMoviesAlts(Base): + __tablename__ = 'pbox_movies_alts' + + id: Mapped[int] = 
mapped_column(Integer, primary_key=True) + min_mov_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_movies.id')) + max_mov_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_movies.id')) + min_max: Mapped[Optional[str]] = mapped_column(String(255)) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + max_mov: Mapped[Optional['PboxMovies']] = relationship('PboxMovies', foreign_keys=[max_mov_id], back_populates='pbox_movies_alts') + min_mov: Mapped[Optional['PboxMovies']] = relationship('PboxMovies', foreign_keys=[min_mov_id], back_populates='pbox_movies_alts_') + + +class PboxMoviesTags(Base): + __tablename__ = 'pbox_movies_tags' + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + movie_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_movies.id')) + tag_id: Mapped[Optional[int]] = mapped_column(ForeignKey('pbox_tags.id')) + movid_tagid: Mapped[Optional[str]] = mapped_column(String(255)) + tags: Mapped[Optional[str]] = mapped_column(Text) + created_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + updated_at: Mapped[Optional[str]] = mapped_column(Text, server_default=text("datetime('now', 'localtime')")) + + movie: Mapped[Optional['PboxMovies']] = relationship('PboxMovies', back_populates='pbox_movies_tags') + tag: Mapped[Optional['PboxTags']] = relationship('PboxTags', back_populates='pbox_movies_tags') diff --git a/scrapy_proj/sqlalchemy/readme.txt b/scrapy_proj/sqlalchemy/readme.txt new file mode 100644 index 0000000..526ca90 --- /dev/null +++ b/scrapy_proj/sqlalchemy/readme.txt @@ -0,0 +1,55 @@ +# 从数据库生成模型类 +sqlacodegen sqlite:////root/sharedata/sqlite/shared.db > models/shared.py +sqlacodegen sqlite:////root/sharedata/sqlite/scrapy.db > models/scrapy.py + +# 初始化 +alembic init migrations/shared_comm 
+alembic init migrations/scrapy +alembic init migrations/testdb + +# 同步修改到数据库(读取 models/shared.py ) +./scripts/sync_shared_comm.sh + +./scripts/reset_testdb.sh + +### 对视图支持不好,主要是视图的字段没有类型,所以在导入导出时会出错,慎用! + + +------------------------- 清理掉无法处理的字段 -------------------- +PRAGMA foreign_keys = OFF; -- 禁用外键检查 + +-- 检查字段是否已删除 +PRAGMA table_info(javdb_series); -- 应看不到parent_id字段 + +-- 验证数据完整性 +SELECT COUNT(*), MIN(id), MAX(id) FROM javdb_series; -- 数量应与操作前一致 + +-- 5. 处理 javdb_series 表 +CREATE TABLE "javdb_series_new" ( + `id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + `name` VARCHAR(255) NOT NULL, + `href` VARCHAR(255) UNIQUE, + `created_at` TEXT DEFAULT (datetime('now', 'localtime')), + `updated_at` TEXT DEFAULT (datetime('now', 'localtime')), + `details` TEXT, + from_list INTEGER DEFAULT (0), + from_movie_list INTEGER DEFAULT (0) +); + +INSERT INTO javdb_series_new (id, name, href, created_at, updated_at, details, from_list, from_movie_list) +SELECT id, name, href, created_at, updated_at, details, from_list, from_movie_list +FROM javdb_series; + +DROP TABLE javdb_series; +ALTER TABLE javdb_series_new RENAME TO javdb_series; + + + +-- 检查字段是否已删除 +PRAGMA table_info(javdb_series); -- 应看不到parent_id字段 + +-- 验证数据完整性 +SELECT COUNT(*), MIN(id), MAX(id) FROM javdb_series; -- 数量应与操作前一致 + + +PRAGMA foreign_keys = ON; -- 恢复外键检查 \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/requirements.txt b/scrapy_proj/sqlalchemy/requirements.txt new file mode 100644 index 0000000..56f0c4a --- /dev/null +++ b/scrapy_proj/sqlalchemy/requirements.txt @@ -0,0 +1,6 @@ +# requirements.txt +sqlalchemy>=2.0.0 +alembic>=1.12.0 +sqlacodegen>=3.0.0 + +# pip install -r requirements.txt \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/scripts/reset_testdb.sh b/scrapy_proj/sqlalchemy/scripts/reset_testdb.sh new file mode 100755 index 0000000..328aa4f --- /dev/null +++ b/scrapy_proj/sqlalchemy/scripts/reset_testdb.sh @@ -0,0 +1,14 @@ +#!/bin/bash +cd $(dirname $0)/.. 
+ +# 删除测试库文件 +#rm -f /root/sharedata/sqlite/test.db + +# 重新创建并应用最新迁移 +alembic -c migrations/testdb/alembic.ini revision --autogenerate -m "Initial test schema" +alembic -c migrations/testdb/alembic.ini upgrade head + +# 回滚数据库A到上一版本 +# alembic -c migrations/testdb/alembic.ini downgrade -1 + +echo "测试库已重置并生成最新表结构" \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/scripts/sync_scrapy.sh b/scrapy_proj/sqlalchemy/scripts/sync_scrapy.sh new file mode 100755 index 0000000..21b011c --- /dev/null +++ b/scrapy_proj/sqlalchemy/scripts/sync_scrapy.sh @@ -0,0 +1,7 @@ +#!/bin/bash +cd $(dirname $0)/.. + +alembic -c migrations/scrapy/alembic.ini revision --autogenerate -m "Auto update from scrapy" +alembic -c migrations/scrapy/alembic.ini upgrade head + +echo "数据库 scrapy 同步完成" \ No newline at end of file diff --git a/scrapy_proj/sqlalchemy/scripts/sync_shared_comm.sh b/scrapy_proj/sqlalchemy/scripts/sync_shared_comm.sh new file mode 100755 index 0000000..89a86e0 --- /dev/null +++ b/scrapy_proj/sqlalchemy/scripts/sync_shared_comm.sh @@ -0,0 +1,10 @@ +#!/bin/bash +cd $(dirname $0)/.. # 切换到tools目录 + +# 生成迁移脚本 +alembic -c migrations/shared_comm/alembic.ini revision --autogenerate -m "Auto update from shared_comm" + +# 执行迁移 +alembic -c migrations/shared_comm/alembic.ini upgrade head + +echo "数据库 shared 同步完成" \ No newline at end of file