"""Classify English word lists with Azure OpenAI.

Scans the EOWL word-list files, collects unique words (length >= 3),
sends them to the model in batches, and asks for a JSON classification
of each word (theme, part of speech, frequency, shared-root group).
Each batch's request/response pair is saved under RESULT_DIR.
"""
import os
import json
import random
import time
import logging
from pathlib import Path
from openai import AzureOpenAI

# --- Azure OpenAI connection settings -------------------------------------
endpoint = "https://grammar.openai.azure.com/"
model_name = "gpt-4o"
deployment = "gpt4"

# SECURITY: never hardcode the subscription key in source control.
# The previous revision embedded a live key here; it must be rotated.
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()]
)

# --- Paths and tunables ----------------------------------------------------
BASE_DIR = './data'
WORDS_DIR = f"{BASE_DIR}/EOWL-v1.1.2/LF Delimited Format"
RESULT_DIR = f"{BASE_DIR}/result"
os.makedirs(RESULT_DIR, exist_ok=True)
TEMP_FILE = f"{BASE_DIR}/temp_words.txt"

# Number of words sent to the model per request.
batch_words_size = 100


def find_words_files(folder):
    """Return the *.txt files in *folder* whose filename contains 'Words'."""
    return [f for f in Path(folder).glob("*.txt") if "Words" in f.name]


def collect_words(files):
    """Read every file and return the unique words of length >= 3 as a list.

    Order is unspecified (set-based dedup), matching downstream usage.
    """
    words_set = set()
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if len(word) >= 3:
                    words_set.add(word)
    return list(words_set)


def write_temp(words):
    """Persist *words* one-per-line to TEMP_FILE (checkpoint for batching)."""
    with open(TEMP_FILE, 'w', encoding='utf-8') as f:
        for word in words:
            f.write(word + '\n')


def read_batches(batch_size=batch_words_size):
    """Yield successive lists of up to *batch_size* words from TEMP_FILE."""
    with open(TEMP_FILE, 'r', encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    for i in range(0, len(words), batch_size):
        yield words[i:i + batch_size]


def build_prompt(words):
    """Build the classification prompt for one batch of *words*.

    The model is told to answer with a bare JSON array (no code fences),
    one object per word.
    """
    word_list = ", ".join(words)
    prompt = f"""
Please analyze the following list of English words and do the following:

1. Classify each word into a theme (like Animals, Plants, Materials, Body Parts, Clothes & Accessories, Food & Drinks, Places, Transportation, Sports, Colors, Numbers, Emotions, Tools, People & Occupations, etc.).
2. Identify the part of speech of each word (verb, noun, adjective, etc.).
3. Mark the frequency of usage of each word in everyday English as High, Medium, or Low.
4. Identify words with the same word root and group them.

Please respond with pure JSON only, without any formatting or explanations.
Each object should have the keys: word, theme, part_of_speech, frequency, same_root_group.

Here are the words:
{word_list}
"""
    return prompt


def call_openai_with_retry(prompt, retries=3, delay=5):
    """Send *prompt* to the chat model, retrying on failure.

    Returns the response text with any ```json code fence stripped,
    or None after *retries* consecutive failures.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert English linguist and lexicographer."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=16000,
                temperature=0.7,
                top_p=1.0,
                model=deployment
            )
            text = response.choices[0].message.content.strip()
            # Strip a Markdown ```json fence if the model added one despite
            # instructions. Check both ends independently: the previous
            # text[7:-3] slice corrupted responses without a trailing fence.
            if text.startswith("```json"):
                text = text[7:]
            if text.endswith("```"):
                text = text[:-3]
            return text.strip()
        except Exception as e:
            logging.warning(f"OpenAI request failed (attempt {attempt+1}): {e}")
            time.sleep(delay)
    logging.error("OpenAI request failed after all retries.")
    return None


def save_result(index, req, resp, is_json):
    """Write one batch's request/response pair to RESULT_DIR as JSON.

    The output filename encodes the batch index, whether the response
    parsed as JSON, and whether its length matches the request's.
    """
    # Matched only when the response is JSON AND covers every requested word.
    matched = is_json and len(req) == len(resp)
    flag = "json" if is_json else "txt"
    match_str = "matched" if matched else 'notmatch'
    filename = f"{RESULT_DIR}/{str(index).zfill(5)}_{match_str}_{flag}.json"
    data = {
        'req_len': len(req),
        'rsp_len': len(resp) if is_json else 0,
        'match': matched,
        'req': req,
        'rsp': resp
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    logging.info(f"Saved result to {filename}")


def process_folder(folder):
    """End-to-end pipeline: collect words, batch, classify, save results."""
    files = find_words_files(folder)
    logging.info(f"Found {len(files)} files to process.")
    words = collect_words(files)
    logging.info(f"Collected {len(words)} unique words.")
    write_temp(words)

    for idx, batch in enumerate(read_batches(), 1):
        logging.info(f"Processing batch {idx} with {len(batch)} words")
        prompt = build_prompt(batch)
        resp_text = call_openai_with_retry(prompt)

        if resp_text is None:
            save_result(idx, batch, "Failed to get response", False)
            continue

        try:
            resp_json = json.loads(resp_text)
            save_result(idx, batch, resp_json, True)
        except json.JSONDecodeError:
            logging.warning(f"Batch {idx} response is not valid JSON.")
            save_result(idx, batch, resp_text, False)

        time.sleep(2)  # pause between batches to respect rate limits


if __name__ == "__main__":
    process_folder(WORDS_DIR)