From f43cd53159d28891d684577d0356571f97602391 Mon Sep 17 00:00:00 2001 From: oscarz Date: Mon, 17 Mar 2025 11:08:13 +0800 Subject: [PATCH] modify scripts --- gitignore | 28 +- .../input => input}/index/000300cons.csv | 0 .../input => input}/index/000510cons.csv | 0 .../input => input}/index/000685cons.csv | 0 .../input => input}/index/930050cons.csv | 0 .../input => input}/index/931643cons.csv | 0 ...SP-500-Index-Constituents-Sept-23-2024.csv | 0 {stockapp/reports_em => reports_em}/config.py | 0 {stockapp/reports_em => reports_em}/deploy.sh | 0 .../reports_em => reports_em}/em_reports.py | 0 {stockapp/reports_em => reports_em}/fetch.py | 0 .../reports_em => reports_em}/sqlite_utils.py | 0 {stockapp/reports_em => reports_em}/utils.py | 0 {stockapp/result => result}/yield_results.csv | 0 scripts/aabook/aabook_fetch.py | 471 - scripts/aabook/config.py | 31 - scripts/aabook/down_list.py | 126 - .../aabook/meta/aabook_cursor_2024-10-01.txt | 1 - .../aabook/meta/aabook_details_2000-01-01.txt | 9 - .../aabook/meta/aabook_details_2024-10-01.txt | 59 - scripts/aabook/meta/aabook_details_all.txt | 5300 -- .../aabook/meta/aabook_list_2000-01-01.txt | 5320 -- .../aabook/meta/aabook_list_2024-10-01.txt | 59 - scripts/aabook/tools_diff.py | 64 - scripts/aabook/tools_dir.py | 66 - scripts/aabook/tools_other.py | 61 - scripts/iafd/merge/auto_tag.py | 101 - scripts/iafd/merge/json2csv.py | 72 - scripts/iafd/merge/url_match.py | 120 - scripts/iafd/src/config.py | 86 - scripts/iafd/src/fetch.py | 411 - scripts/iafd/src/iafd_scraper.py | 562 - scripts/iafd/src/load.py | 107 - scripts/iafd/src/sqlite_utils.py | 848 - scripts/iafd/src/utils.py | 101 - scripts/iafd/src_json/config.py | 26 - scripts/iafd/src_json/movie_detail_fetch.py | 334 - scripts/iafd/src_json/movie_list_fetch.py | 255 - scripts/iafd/src_json/performers_details.py | 393 - .../iafd/src_json/performers_list_astro.py | 140 - .../iafd/src_json/performers_list_birth.py | 152 - .../iafd/src_json/performers_list_ethnic.py | 166 - .../iafd/src_json/performers_list_merge.py | 120 - scripts/iafd/tools/data_merge.py | 236 - scripts/iafd/tools/iafd_scrape.py | 163 - scripts/iafd/tools/stashdb_merge.py | 90 - scripts/javdb/get_javdb.py | 132 - scripts/javdb/javdb_tmp/CWPBD_all.txt | 161 - scripts/javdb/javdb_tmp/DRGBD_all.txt | 23 - scripts/javdb/javdb_tmp/DSAMBD_all.txt | 39 - scripts/javdb/javdb_tmp/LAFBD_all.txt | 87 - scripts/javdb/javdb_tmp/MKBD_all.txt | 173 - scripts/javdb/javdb_tmp/S2MBD_all.txt | 279 - scripts/javdb/javdb_tmp/SKYHD_all.txt | 173 - scripts/javdb/javdb_tmp/SMBD_all.txt | 220 - scripts/javdb/javdb_tmp/all.txt | 1148 - scripts/javdb/src/config.py | 85 - scripts/javdb/src/fetch.py | 294 - scripts/javdb/src/scraper.py | 504 - scripts/javdb/src/sqlite_utils.py | 599 - scripts/javdb/src/utils.py | 18 - scripts/javhd/list_fetch.py | 108 - scripts/javhd/list_format.py | 119 - scripts/javhd/model_fetch.py | 176 - scripts/javhd/result/models.csv | 2648 - scripts/javhd/result/models.json | 21178 ------ scripts/javhd/result/models_detail.csv | 2648 - scripts/javhd/result/models_detail.json | 45001 ------------- scripts/javhd/tools.py | 100 - scripts/pornhub/cmd.py | 92 - scripts/pornhub/config.py | 31 - scripts/pornhub/custom_pornhub.py | 76 - scripts/pornhub/get_list.py | 179 - scripts/pornhub/sort.py | 30 - scripts/pornhub/test.py | 35 - scripts/schema.sql | 315 - scripts/thelordofporn/actress_fetch.py | 220 - scripts/thelordofporn/config.py | 31 - scripts/thelordofporn/list_fetch.py | 133 - .../thelordofporn/result/actress_detail.csv | 2084 - .../thelordofporn/result/actress_detail.json | 56243 ---------------- scripts/thelordofporn/result/actresses.csv | 2084 - scripts/thelordofporn/result/actresses.json | 14583 ---- .../result/top_pornstars_list.csv | 510 - .../thelordofporn/result/top_scenes_list.csv | 902 - scripts/thelordofporn/tools.py | 166 - scripts/thelordofporn/top_scenes.py | 205 - scripts/u9a9/get_u9a9.py | 117 - scripts/vixen_group/blacked-actress.py | 30 - scripts/vixen_group/blacked-format.py | 162 - .../vixen_group/formatted/blacked-list.txt | 643 - scripts/vixen_group/formatted/tushy-list.txt | 569 - scripts/vixen_group/history/actress.txt | 292 - .../vixen_group/history/blacked-actress.txt | 429 - scripts/vixen_group/history/blacked-list.txt | 643 - scripts/vixen_group/history/prompt.txt | 80 - scripts/vixen_group/history/tushy-actress.txt | 362 - scripts/vixen_group/history/tushy-list.txt | 569 - .../vixen_group/history/tvb-actress-all.txt | 698 - .../vixen_group/input_files/blacked-all.txt | 643 - scripts/vixen_group/input_files/tushy-raw.txt | 569 - scripts/vixen_group/input_files/vixen-all.txt | 474 - .../vixen_group/result/blacked-actress.txt | 428 - scripts/vixen_group/result/tushy-actress.txt | 362 - .../vixen_group/result/tvb-actress-all.txt | 697 - scripts/vixen_group/result/vixen-actress.txt | 292 - scripts/vixen_group/tushy-actress.py | 30 - scripts/vixen_group/tushy-format.py | 69 - scripts/vixen_group/tvb-actress.py | 80 - scripts/vixen_group/vixen-actress.py | 30 - {stockapp/shell => shell}/mysql-ddl.txt | 0 {stockapp/shell => shell}/stat.sql | 0 stockapp/schema.sql => sqlite_schema.sql | 0 {stockapp/src => src}/bak_stat_growth_rate.py | 0 {stockapp/src => src}/config.py | 0 {stockapp/src => src}/crawling/__init__.py | 0 {stockapp/src => src}/crawling/fund_etf_em.py | 0 {stockapp/src => src}/crawling/stock_cpbd.py | 0 .../src => src}/crawling/stock_dzjy_em.py | 0 .../src => src}/crawling/stock_fhps_em.py | 0 .../src => src}/crawling/stock_fund_em.py | 0 .../src => src}/crawling/stock_hist_em.py | 0 .../src => src}/crawling/stock_lhb_em.py | 0 .../src => src}/crawling/stock_lhb_sina.py | 0 .../src => src}/crawling/stock_selection.py | 0 .../src => src}/crawling/trade_date_hist.py | 0 .../src => src}/cursor/his_kline_em_codes.txt | 0 .../cursor/his_kline_em_done_codes.txt | 0 {stockapp/src => src}/get_futu_rehb.py | 0 {stockapp/src => src}/get_his_kline_em.py | 0 {stockapp/src => src}/get_hs300_his_kline.py | 0 {stockapp/src => src}/get_market_snapshot.py | 0 {stockapp/src => src}/get_plat_list.py | 0 {stockapp/src => src}/get_sp500_his_kline.py | 0 .../src => src}/sample/get_futu_curr_kline.py | 0 .../src => src}/sample/get_futu_snapshot.py | 0 .../src => src}/sample/get_history_kline.py | 0 .../src => src}/sample/get_personal_list.py | 0 {stockapp/src => src}/sample/get_plat_list.py | 0 .../src => src}/sample/get_plat_stock.py | 0 {stockapp/src => src}/sample/get_rehab.py | 0 {stockapp/src => src}/sample/get_yh_kline.py | 0 .../sample/get_yh_kline_download.py | 0 {stockapp/src => src}/stat_adjust_kline.py | 0 {stockapp/src => src}/stat_growth.py | 0 {stockapp/src => src}/stat_growth_em.py | 0 .../src => src}/stat_sp500_adjust_kline.py | 0 {stockapp/src => src}/stat_yield_rate.py | 0 {stockapp/src => src}/test_hs300_quant.py | 0 {stockapp/src => src}/test_quant.py | 0 {tushare-stock => tushare}/Dockerfile | 0 {tushare-stock => tushare}/db/ddl.sql | 0 {tushare-stock => tushare}/db/sql-query.sh | 0 {tushare-stock => tushare}/docker-compose.yml | 0 {tushare-stock => tushare}/kechuang50.txt | 0 {tushare-stock => tushare}/mod/00698.xlsx | Bin {tushare-stock => tushare}/mod/HKStock.py | 0 {tushare-stock => tushare}/mod/Template.xlsx | Bin {tushare-stock => tushare}/mod/main.py | 0 {tushare-stock => tushare}/readme.txt | 0 {tushare-stock => tushare}/requirements.txt | 0 {tushare-stock => tushare}/src/balance.log | 0 {tushare-stock => tushare}/src/cashflow.log | 0 {tushare-stock => tushare}/src/config.py | 0 {tushare-stock => tushare}/src/db.sql | 0 {tushare-stock => tushare}/src/income.log | 0 {tushare-stock => tushare}/src/indicator.log | 0 .../src/job_ods_hs_base_ipo.py | 0 .../src/job_ods_hs_base_list.py | 0 .../src/job_ods_hs_daily_basic.py | 0 .../src/job_ods_hs_finance_balance.py | 0 .../src/job_ods_hs_finance_cashflow.py | 0 .../src/job_ods_hs_finance_income.py | 0 .../src/job_ods_hs_finance_indicator.py | 0 .../src/job_ods_hs_indicator_list.py | 0 {tushare-stock => tushare}/src/test.py | 0 .../港股接口(eastmoney).txt | 0 177 files changed, 5 insertions(+), 178173 deletions(-) rename {stockapp/input => input}/index/000300cons.csv (100%) rename {stockapp/input => input}/index/000510cons.csv (100%) rename {stockapp/input => input}/index/000685cons.csv (100%) rename {stockapp/input => input}/index/930050cons.csv (100%) rename {stockapp/input => input}/index/931643cons.csv (100%) rename {stockapp/input => input}/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv (100%) rename {stockapp/reports_em => reports_em}/config.py (100%) rename {stockapp/reports_em => reports_em}/deploy.sh (100%) rename {stockapp/reports_em => reports_em}/em_reports.py (100%) rename {stockapp/reports_em => reports_em}/fetch.py (100%) rename {stockapp/reports_em => reports_em}/sqlite_utils.py (100%) rename {stockapp/reports_em => reports_em}/utils.py (100%) rename {stockapp/result => result}/yield_results.csv (100%) delete mode 100644 scripts/aabook/aabook_fetch.py delete mode 100644 scripts/aabook/config.py delete mode 100644 scripts/aabook/down_list.py delete mode 100644 scripts/aabook/meta/aabook_cursor_2024-10-01.txt delete mode 100644 scripts/aabook/meta/aabook_details_2000-01-01.txt delete mode 100644 scripts/aabook/meta/aabook_details_2024-10-01.txt delete mode 100644 scripts/aabook/meta/aabook_details_all.txt delete mode 100644 scripts/aabook/meta/aabook_list_2000-01-01.txt delete mode 100644 scripts/aabook/meta/aabook_list_2024-10-01.txt delete mode 100644 scripts/aabook/tools_diff.py delete mode 100644 scripts/aabook/tools_dir.py delete mode 100644 scripts/aabook/tools_other.py delete mode 100644 scripts/iafd/merge/auto_tag.py delete mode 100644 scripts/iafd/merge/json2csv.py delete mode 100644 scripts/iafd/merge/url_match.py delete mode 100644 scripts/iafd/src/config.py delete mode 100644 scripts/iafd/src/fetch.py delete mode 100644 scripts/iafd/src/iafd_scraper.py delete mode 100644 scripts/iafd/src/load.py delete mode 100644 scripts/iafd/src/sqlite_utils.py delete mode 100644 scripts/iafd/src/utils.py delete mode 100644 scripts/iafd/src_json/config.py delete mode 100644 scripts/iafd/src_json/movie_detail_fetch.py delete mode 100644 scripts/iafd/src_json/movie_list_fetch.py delete mode 100644 scripts/iafd/src_json/performers_details.py delete mode 100644 scripts/iafd/src_json/performers_list_astro.py delete mode 100644 scripts/iafd/src_json/performers_list_birth.py delete mode 100644 scripts/iafd/src_json/performers_list_ethnic.py delete mode 100644 scripts/iafd/src_json/performers_list_merge.py delete mode 100644 scripts/iafd/tools/data_merge.py delete mode 100644 scripts/iafd/tools/iafd_scrape.py delete mode 100644 scripts/iafd/tools/stashdb_merge.py delete mode 100644 scripts/javdb/get_javdb.py delete mode 100644 scripts/javdb/javdb_tmp/CWPBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/DRGBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/DSAMBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/LAFBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/MKBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/S2MBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/SKYHD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/SMBD_all.txt delete mode 100644 scripts/javdb/javdb_tmp/all.txt delete mode 100644 scripts/javdb/src/config.py delete mode 100644 scripts/javdb/src/fetch.py delete mode 100644 scripts/javdb/src/scraper.py delete mode 100644 scripts/javdb/src/sqlite_utils.py delete mode 100644 scripts/javdb/src/utils.py delete mode 100644 scripts/javhd/list_fetch.py delete mode 100644 scripts/javhd/list_format.py delete mode 100644 scripts/javhd/model_fetch.py delete mode 100644 scripts/javhd/result/models.csv delete mode 100644 scripts/javhd/result/models.json delete mode 100644 scripts/javhd/result/models_detail.csv delete mode 100644 scripts/javhd/result/models_detail.json delete mode 100644 scripts/javhd/tools.py delete mode 100644 scripts/pornhub/cmd.py delete mode 100644 scripts/pornhub/config.py delete mode 100644 scripts/pornhub/custom_pornhub.py delete mode 100644 scripts/pornhub/get_list.py delete mode 100644 scripts/pornhub/sort.py delete mode 100644 scripts/pornhub/test.py delete mode 100644 scripts/schema.sql delete mode 100644 scripts/thelordofporn/actress_fetch.py delete mode 100644 scripts/thelordofporn/config.py delete mode 100644 scripts/thelordofporn/list_fetch.py delete mode 100644 scripts/thelordofporn/result/actress_detail.csv delete mode 100644 scripts/thelordofporn/result/actress_detail.json delete mode 100644 scripts/thelordofporn/result/actresses.csv delete mode 100644 scripts/thelordofporn/result/actresses.json delete mode 100644 scripts/thelordofporn/result/top_pornstars_list.csv delete mode 100644 scripts/thelordofporn/result/top_scenes_list.csv delete mode 100644 scripts/thelordofporn/tools.py delete mode 100644 scripts/thelordofporn/top_scenes.py delete mode 100644 scripts/u9a9/get_u9a9.py delete mode 100644 scripts/vixen_group/blacked-actress.py delete mode 100644 scripts/vixen_group/blacked-format.py delete mode 100644 scripts/vixen_group/formatted/blacked-list.txt delete mode 100644 scripts/vixen_group/formatted/tushy-list.txt delete mode 100644 scripts/vixen_group/history/actress.txt delete mode 100644 scripts/vixen_group/history/blacked-actress.txt delete mode 100644 scripts/vixen_group/history/blacked-list.txt delete mode 100644 scripts/vixen_group/history/prompt.txt delete mode 100644 scripts/vixen_group/history/tushy-actress.txt delete mode 100644 scripts/vixen_group/history/tushy-list.txt delete mode 100644 scripts/vixen_group/history/tvb-actress-all.txt delete mode 100644 scripts/vixen_group/input_files/blacked-all.txt delete mode 100644 scripts/vixen_group/input_files/tushy-raw.txt delete mode 100644 scripts/vixen_group/input_files/vixen-all.txt delete mode 100644 scripts/vixen_group/result/blacked-actress.txt delete mode 100644 scripts/vixen_group/result/tushy-actress.txt delete mode 100644 scripts/vixen_group/result/tvb-actress-all.txt delete mode 100644 scripts/vixen_group/result/vixen-actress.txt delete mode 100644 scripts/vixen_group/tushy-actress.py delete mode 100644 scripts/vixen_group/tushy-format.py delete mode 100644 scripts/vixen_group/tvb-actress.py delete mode 100644 scripts/vixen_group/vixen-actress.py rename {stockapp/shell => shell}/mysql-ddl.txt (100%) rename {stockapp/shell => shell}/stat.sql (100%) rename stockapp/schema.sql => sqlite_schema.sql (100%) rename {stockapp/src => src}/bak_stat_growth_rate.py (100%) rename {stockapp/src => src}/config.py (100%) rename {stockapp/src => src}/crawling/__init__.py (100%) rename {stockapp/src => src}/crawling/fund_etf_em.py (100%) rename {stockapp/src => src}/crawling/stock_cpbd.py (100%) rename {stockapp/src => src}/crawling/stock_dzjy_em.py (100%) rename {stockapp/src => src}/crawling/stock_fhps_em.py (100%) rename {stockapp/src => src}/crawling/stock_fund_em.py (100%) rename {stockapp/src => src}/crawling/stock_hist_em.py (100%) rename {stockapp/src => src}/crawling/stock_lhb_em.py (100%) rename {stockapp/src => src}/crawling/stock_lhb_sina.py (100%) rename {stockapp/src => src}/crawling/stock_selection.py (100%) rename {stockapp/src => src}/crawling/trade_date_hist.py (100%) rename {stockapp/src => src}/cursor/his_kline_em_codes.txt (100%) rename {stockapp/src => src}/cursor/his_kline_em_done_codes.txt (100%) rename {stockapp/src => src}/get_futu_rehb.py (100%) rename {stockapp/src => src}/get_his_kline_em.py (100%) rename {stockapp/src => src}/get_hs300_his_kline.py (100%) rename {stockapp/src => src}/get_market_snapshot.py (100%) rename {stockapp/src => src}/get_plat_list.py (100%) rename {stockapp/src => src}/get_sp500_his_kline.py (100%) rename {stockapp/src => src}/sample/get_futu_curr_kline.py (100%) rename {stockapp/src => src}/sample/get_futu_snapshot.py (100%) rename {stockapp/src => src}/sample/get_history_kline.py (100%) rename {stockapp/src => src}/sample/get_personal_list.py (100%) rename {stockapp/src => src}/sample/get_plat_list.py (100%) rename {stockapp/src => src}/sample/get_plat_stock.py (100%) rename {stockapp/src => src}/sample/get_rehab.py (100%) rename {stockapp/src => src}/sample/get_yh_kline.py (100%) rename {stockapp/src => src}/sample/get_yh_kline_download.py (100%) rename {stockapp/src => src}/stat_adjust_kline.py (100%) rename {stockapp/src => src}/stat_growth.py (100%) rename {stockapp/src => src}/stat_growth_em.py (100%) rename {stockapp/src => src}/stat_sp500_adjust_kline.py (100%) rename {stockapp/src => src}/stat_yield_rate.py (100%) rename {stockapp/src => src}/test_hs300_quant.py (100%) rename {stockapp/src => src}/test_quant.py (100%) rename {tushare-stock => tushare}/Dockerfile (100%) rename {tushare-stock => tushare}/db/ddl.sql (100%) rename {tushare-stock => tushare}/db/sql-query.sh (100%) rename {tushare-stock => tushare}/docker-compose.yml (100%) rename {tushare-stock => tushare}/kechuang50.txt (100%) rename {tushare-stock => tushare}/mod/00698.xlsx (100%) rename {tushare-stock => tushare}/mod/HKStock.py (100%) rename {tushare-stock => tushare}/mod/Template.xlsx (100%) rename {tushare-stock => tushare}/mod/main.py (100%) rename {tushare-stock => tushare}/readme.txt (100%) rename {tushare-stock => tushare}/requirements.txt (100%) rename {tushare-stock => tushare}/src/balance.log (100%) rename {tushare-stock => tushare}/src/cashflow.log (100%) rename {tushare-stock => tushare}/src/config.py (100%) rename {tushare-stock => tushare}/src/db.sql (100%) rename {tushare-stock => tushare}/src/income.log (100%) rename {tushare-stock => tushare}/src/indicator.log (100%) rename {tushare-stock => tushare}/src/job_ods_hs_base_ipo.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_base_list.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_daily_basic.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_finance_balance.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_finance_cashflow.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_finance_income.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_finance_indicator.py (100%) rename {tushare-stock => tushare}/src/job_ods_hs_indicator_list.py (100%) rename {tushare-stock => tushare}/src/test.py (100%) rename {tushare-stock => tushare}/港股接口(eastmoney).txt (100%) diff --git a/gitignore b/gitignore index a59867f..e51743b 100644 --- a/gitignore +++ b/gitignore @@ -1,29 +1,11 @@ # 忽略 log 目录 log/ -scripts/aabook/log/ -scripts/aabook/local/ -scripts/aabook/data/ -scripts/u9a9/torrents/ -scripts/u9a9/log/ -scripts/javdb/log/ -scripts/javhd/result/tmp/ -scripts/javhd/log/ -scripts/iafd/data/tmp/ -scripts/iafd/result/tmp/ -scripts/iafd/result/bak/ -scripts/iafd/result/performers/ -scripts/iafd/result/movies/ -scripts/iafd/log/ -scripts/thelordofporn/log/ -scripts/vixen_group/log/ -scripts/pornhub/log/ -stockapp/data/ -stockapp/log/ -stockapp/result/ -stockapp/reports_em/json_data/ -stockapp/reports_em/pdfs/ -stockapp/reports_em/raw/ +data/ +result/ +reports_em/json_data/ +reports_em/pdfs/ +reports_em/raw/ # 忽略 Python 编译文件 *.pyc diff --git a/stockapp/input/index/000300cons.csv b/input/index/000300cons.csv similarity index 100% rename from stockapp/input/index/000300cons.csv rename to input/index/000300cons.csv diff --git a/stockapp/input/index/000510cons.csv b/input/index/000510cons.csv similarity index 100% rename from stockapp/input/index/000510cons.csv rename to input/index/000510cons.csv diff --git a/stockapp/input/index/000685cons.csv b/input/index/000685cons.csv similarity index 100% rename from stockapp/input/index/000685cons.csv rename to input/index/000685cons.csv diff --git a/stockapp/input/index/930050cons.csv b/input/index/930050cons.csv similarity index 100% rename from stockapp/input/index/930050cons.csv rename to input/index/930050cons.csv diff --git a/stockapp/input/index/931643cons.csv b/input/index/931643cons.csv similarity index 100% rename from stockapp/input/index/931643cons.csv rename to input/index/931643cons.csv diff --git a/stockapp/input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv b/input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv similarity index 100% rename from stockapp/input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv rename to input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv diff --git a/stockapp/reports_em/config.py b/reports_em/config.py similarity index 100% rename from stockapp/reports_em/config.py rename to reports_em/config.py diff --git a/stockapp/reports_em/deploy.sh b/reports_em/deploy.sh similarity index 100% rename from stockapp/reports_em/deploy.sh rename to reports_em/deploy.sh diff --git a/stockapp/reports_em/em_reports.py b/reports_em/em_reports.py similarity index 100% rename from stockapp/reports_em/em_reports.py rename to reports_em/em_reports.py diff --git a/stockapp/reports_em/fetch.py b/reports_em/fetch.py similarity index 100% rename from stockapp/reports_em/fetch.py rename to reports_em/fetch.py diff --git a/stockapp/reports_em/sqlite_utils.py b/reports_em/sqlite_utils.py similarity index 100% rename from stockapp/reports_em/sqlite_utils.py rename to reports_em/sqlite_utils.py diff --git a/stockapp/reports_em/utils.py b/reports_em/utils.py similarity index 100% rename from stockapp/reports_em/utils.py rename to reports_em/utils.py diff --git a/stockapp/result/yield_results.csv b/result/yield_results.csv similarity index 100% rename from stockapp/result/yield_results.csv rename to result/yield_results.csv diff --git a/scripts/aabook/aabook_fetch.py b/scripts/aabook/aabook_fetch.py deleted file mode 100644 index 96643c5..0000000 --- a/scripts/aabook/aabook_fetch.py +++ /dev/null @@ -1,471 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import os -import sys -import random -import time -import re -import logging -from datetime import datetime -from datetime import date -import config # 日志配置 -from down_list import novel_map - - -# 日志 -config.setup_logging() - -# 配置基础URL和输出文件 -base_url = 'https://aabook.xyz' -list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount' -list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update' -curr_novel_pages = 0 - -meta_dir = 'meta' - -list_file = f'{meta_dir}/list.txt' -details_file = f'{meta_dir}/details.txt' -down_list_file = f'{meta_dir}/down_list.txt' - -# User-Agent 列表 -user_agents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0", - "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36" -] -# 定义获取页面内容的函数,带重试机制 -def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10): - retries = 0 - # 随机选择一个 User-Agent - headers = { - 'User-Agent': random.choice(user_agents) - } - - while retries < max_retries: - try: - response = requests.get(url, headers=headers, timeout=default_timeout, stream=True) - response.raise_for_status() - return response.text # 请求成功,返回内容 - except requests.RequestException as e: - retries += 1 - logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...") - if retries >= max_retries: - logging.error(f"Failed to fetch page {url} after {max_retries} retries.") - return None - time.sleep(sleep_time) # 休眠指定的时间,然后重试 - - -# 获取排行列表 -def get_list(write_list_file = list_file, list_url = list_url_wordcount, start_date = '2000-01-01', order_by_date = False): - page_num = 1 - start_time = datetime.strptime(f'{start_date} 00:00:00', "%Y-%m-%d %H:%M:%S") - with open(write_list_file, 'w', encoding='utf-8') as f: - while True: - # 发起请求 - list_url = list_url.format(page_num) - logging.info(f"Fetching page [{page_num}] {list_url}") - - content = get_page_content(list_url) - soup = BeautifulSoup(content, 'html.parser') - - # 查找书籍列表 - list_main = soup.find('div', class_='list_main') - if not list_main: - logging.info("No list_main Found. retry...") - continue - - tbody = list_main.find('tbody') - if not tbody: - logging.info("No tbody found. retry...") - continue - - # 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期) - for tr in tbody.find_all('tr'): - tds = tr.find_all('td') - if len(tds) < 6: - logging.info("Invalid tr format.") - continue - ranking = tds[0].text.strip() - category = tds[1].text.strip() - book_link_tag = tds[2].find('a') - book_name = book_link_tag.text.strip() - book_link = base_url + '/' + book_link_tag['href'] - author = tds[3].text.strip() - monthly_tickets = tds[4].text.strip() - update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期) - - # 检查更新 - if order_by_date : - up_time = datetime.strptime(update_time, "%Y-%m-%d %H:%M:%S") - if start_time > up_time: - return - - # 写入 aabook_list.txt - # 排名 分类 书名 作者 月票 字数(更新日期) 书本链接 - f.write(f"{ranking}\t{category}\t{book_name}\t{author}\t{monthly_tickets}\t{update_time}\t{book_link}\n") - f.flush() - - # 查找下一页链接 - next_page_tag = soup.find('a', title='下一页') - if next_page_tag: - list_url = base_url + next_page_tag['href'] - page_num += 1 - else: - logging.info("No next page, stopping.") - break - - time.sleep(3) - #break ## for test - -# 拉取详情,并校验 -def fetch_detail_and_check(url, book_name): - while True: - contenxt = get_page_content(url) - soup = BeautifulSoup(contenxt, 'html.parser') - - # 解析书籍详细信息 - book_info_tag = soup.find('li', class_='zuopinxinxi') - if not book_info_tag: - logging.info(f"No details found for {book_name}, retry...") - continue - - book_info_lis = book_info_tag.find_all('li') - if len(book_info_lis) < 4: - logging.info(f"invalid book info. {book_name}. retry...") - continue - - return contenxt - -# 获取每本书的详情 -def get_detail(write_list_file = list_file, wirte_details_file = details_file): - # 读取已完成详细信息的书籍链接 - if os.path.exists(wirte_details_file): - with open(wirte_details_file, 'r', encoding='utf-8') as f: - completed_links = set(line.split('\t')[4] for line in f.readlines()) - else: - completed_links = set() - - with open(write_list_file, 'r', encoding='utf-8') as f_list, open(wirte_details_file, 'a', encoding='utf-8') as f_details: - for line in f_list: - fields = line.strip().split('\t') - if len(fields) < 7: - continue - book_link = fields[6] - book_name = fields[2] - - if book_link in completed_links: - logging.info(f"Skipping {book_name} {book_link}, already processed.") - continue - - # 访问书籍详细页 - logging.info(f"Fetching details for {book_name} {book_link}") - #contenxt = get_page_content(book_link) - contenxt = fetch_detail_and_check(book_link, book_name) - soup = BeautifulSoup(contenxt, 'html.parser') - - # 解析书籍详细信息 - book_info_tag = soup.find('li', class_='zuopinxinxi') - if not book_info_tag: - logging.info(f"No details found for {book_name}, skipping.") - continue - - book_info_lis = book_info_tag.find_all('li') - if len(book_info_lis) < 4: - logging.info(f"invalid book info. {book_name}") - continue - book_category = book_info_lis[0].find('span').text.strip() - book_status = book_info_lis[1].find('span').text.strip() - total_word_count = book_info_lis[2].find('span').text.strip() - total_clicks = book_info_lis[3].find('span').text.strip() - # 去掉后面的汉字,只要数字 - total_word_count = int(re.search(r'\d+', total_word_count).group()) - - # 读取创建时间 - creation_time_tag = soup.find('li', class_='update_time') - creation_time = creation_time_tag.text.strip() if creation_time_tag else 'N/A' - - # 获取起始页链接和编号 - start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a') - start_page_link = base_url + '/' + start_page_tag['href'] - start_page_number = start_page_link.split('-')[-1].replace('.html', '') - - # 写入 aabook_details.txt - # 排名 类别 书名 作者 书本链接 首页链接 开始链接编码 状态 总字数 总点击 总字数 创建时间 - f_details.write(f"{fields[0]}\t{book_category}\t{fields[2]}\t{fields[3]}\t{book_link}\t" - f"{start_page_link}\t{start_page_number}\t{book_status}\t{total_word_count}\t" - f"{total_clicks}\t{fields[5]}\t{creation_time}\n") - f_details.flush() - - time.sleep(5) - - -# 解析内容中的水印部分 -def clean_watermarks(html): - """ - 过滤掉带有 class 属性的水印标签及其内部内容,保留其他标签结构。 - """ - # 使用正则表达式匹配并移除任何带有 class 属性的 HTML 标签及其内容 - cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?]+>', '', html, flags=re.DOTALL) - return cleaned_html - -def process_paragraph(paragraph): - # 获取完整的 HTML 结构,而不是 get_text() - paragraph_html = str(paragraph) - - # 移除水印标签 - cleaned_html = clean_watermarks(paragraph_html) - - # 使用 BeautifulSoup 解析移除水印标签后的 HTML 并提取文本 - soup = BeautifulSoup(cleaned_html, 'html.parser') - cleaned_text = soup.get_text().strip() - - return cleaned_text - -# 从 script 标签中提取 content_url -def extract_content_url(soup, base_url, chapid): - # 找到所有