Fix select extend summary #67

Merged 11 commits on Jun 17, 2025
79 changes: 79 additions & 0 deletions analysis/PMID_to_date_extractor.py
@@ -0,0 +1,79 @@
""" This script extracts the field .metaData.externalReferences from all json files in a directory and saves them in a dictionary.
It then uses the PubMedFetcher to fetch the corresponding PubMed article's date of publication for each reference.
The output is a dictionary with the PubMed IDs as keys and their publication dates as values.

Needs an NCBI API key to work. The key can be set in the environment variable NCBI_API_KEY or read from a file.
"""

import json
import os
import sys

from metapub import PubMedFetcher
from tqdm import tqdm


# Path to the directory containing the JSON files, taken as CLI input
try:
    ppkts_dir = str(sys.argv[1])
except IndexError:
    ppkts_dir = '/Users/leonardo/data/ppkts_4967_polyglot/jsons'

# Get the directory of the current script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Set up the NCBI API key
# Uncomment the following lines to read the API key from a file (does not work...)
"""# Read key file ~/ncbi.key
with open(os.path.expanduser('~/ncbi.key'), 'r') as f:
    os.environ['NCBI_API_KEY'] = f.read().strip()
"""

# Iterate over all JSON files in the directory and populate a dictionary keyed by
# filename, with the field .metaData.externalReferences as the value
ppkts = {}
for filename in os.listdir(ppkts_dir):
    if filename.endswith('.json'):
        with open(os.path.join(ppkts_dir, filename), 'r') as f:
            data = json.load(f)
            # Check if the key exists in the JSON data
            if 'metaData' in data and 'externalReferences' in data['metaData']:
                ppkts[filename] = data['metaData']['externalReferences']
            else:
                print(f"Key not found in {filename}")

pmid_list = []
# Iterate over the dictionary and build a list of unique PubMed IDs
for filename, references in ppkts.items():
    for reference in references:
        if 'id' in reference:
            pmid_list.append(reference['id'].replace("PMID:", "").strip())
pmid_list = list(set(pmid_list)) # Remove duplicates
# Print the number of unique PubMed IDs found
print(f"Found {len(pmid_list)} unique PubMed IDs.")

# For each unique PubMed ID, fetch the corresponding article's date of publication (Entrez history date)
fetcher = PubMedFetcher()
pmid2date_dict = {}
for pmid in tqdm(pmid_list, desc="Fetching PubMed data", unit="PMID"):
    try:
        article = fetcher.article_by_pmid(pmid)
        pmid2date_dict['PMID:' + pmid] = {
            'date': article.history['entrez']
        }
    except Exception as e:
        print(f"Error fetching data for {pmid}: {e}")


# Save the dictionary to a JSON file
# Construct the output file path relative to the script's directory
output_json_file = os.path.join(script_dir, "../../../leakage_experiment", "pmid2date_dict.json")
# Ensure the output directory exists
output_dir = os.path.dirname(output_json_file)
os.makedirs(output_dir, exist_ok=True)
# Write the dictionary to a JSON file
with open(output_json_file, "w") as f:
    json.dump(pmid2date_dict, f, indent=4, default=str)  # Use default=str to handle datetime objects

print(f"Saved dictionary to {output_json_file}")

61 changes: 61 additions & 0 deletions analysis/count_gpt_tokens.py
@@ -0,0 +1,61 @@
import tiktoken
import os

# Choose the encoding for your model (e.g., gpt-3.5-turbo, gpt-4, etc.)
encoding = tiktoken.encoding_for_model("gpt-4o")

langs = ["en", "cs", "de", "es", "fr", "it", "ja", "tr", "zh", "nl"]

# For each language, count the tokens in all files in the directory and save the totals
directory = "in_multlingual_nov24/prompts/"

count_dict_input = {}
count_dict_output = {}

for lang in langs:
    # INPUT COST
    total_tokens = 0
    langpath = os.path.join(directory, lang)
    for filename in os.listdir(langpath):  # For all files
        if filename.endswith(f"_{lang}-prompt.txt"):
            with open(os.path.join(langpath, filename), 'r', encoding='utf-8') as file:
                text = file.read()
            num_tokens = len(encoding.encode(text))
            total_tokens += num_tokens
    count_dict_input[lang] = total_tokens
    print(f"Total input tokens for {lang}: {total_tokens}")

    # OUTPUT COST
    total_tokens = 0
    outpath = f"out_multlingual_nov24/raw_results/multilingual/{lang}/differentials_by_file/"
    for filename in os.listdir(outpath):
        if filename.endswith(f"_{lang}-prompt.txt.result"):
            with open(os.path.join(outpath, filename), 'r', encoding='utf-8') as file:
                text = file.read()
            num_tokens = len(encoding.encode(text, allowed_special={'<|endoftext|>'}))
            total_tokens += num_tokens
    count_dict_output[lang] = total_tokens
    print(f"Total output tokens for {lang}: {total_tokens}")


count_dict_input['total'] = sum(count_dict_input.values())
count_dict_output['total'] = sum(count_dict_output.values())

print(f"Full input token count: {count_dict_input['total']}")
print(f"Full output token count: {count_dict_output['total']}")

# Save the dictionaries to files
input_file = "analysis_out/token_counts/input_token_counts.txt"
output_file = "analysis_out/token_counts/output_token_counts.txt"
os.makedirs(os.path.dirname(input_file), exist_ok=True)
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(input_file, 'w') as f:
    f.write("Input cost for GPT-4o $5.00 / 1M input tokens\n")
    for lang, count in count_dict_input.items():
        f.write(f"{lang}: {count}\n")
print(f"Input token counts saved to {input_file}")
with open(output_file, 'w') as f:
    f.write("Output cost for GPT-4o $20.00 / 1M output tokens\n")
    for lang, count in count_dict_output.items():
        f.write(f"{lang}: {count}\n")
print(f"Output token counts saved to {output_file}")
91 changes: 91 additions & 0 deletions analysis/count_translated_prompts_and_copy.py
@@ -0,0 +1,91 @@
"""Look in the phenopacket2prompt output directory for the common phenopackets across languages and copy them to another directory.
"""

import os
import re
import shutil
import tqdm
import sys
import json


create_list_file = False
copy_prompt_files = False
copy_json_files = True

try:
    fp = sys.argv[1]
except IndexError:
    print("No path provided, using default path.")
    # Default path to the phenopacket2prompt output directory
    fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/6668prompts/"

try:
    output_file = sys.argv[2]
except IndexError:
    print("No output file provided, using default output file.")
    # Default output file to save the common phenopackets
    output_file = "final_multilingual_output/ppkts_4917set.txt"  # Specify the output file name

try:
    dst_dir = sys.argv[3]
except IndexError:
    # Default destination directory to copy the files
    print("No destination directory provided, using default destination directory.")
    dst_dir = "/Users/leonardo/data/4917_poly_ppkts"

# Take as a fourth argument the comma-separated list of languages to consider
if len(sys.argv) > 4:
    langs = sys.argv[4].split(",")
else:
    # Default languages to consider
    langs = ["en", "ja", "es", "de", "it", "nl", "tr", "zh", "cs", "fr"]
    print("No languages provided, using default languages: ", langs, "\nYou can provide them as a comma-separated list as the fourth argument.")

promptfiles = {}
for lang in langs:
    promptfiles[lang] = []
    for dirpath, dirnames, filenames in os.walk(fp + lang):
        for fn in filenames:
            fn = fn.replace("_" + lang + "-prompt.txt", "")
            promptfiles[lang].append(fn)
        break

intersection = set()

# Convert lists to sets for intersection
promptfiles = {lang: set(files) for lang, files in promptfiles.items()}

# Create an intersection set of all languages
# Initialize the intersection with the first language's set
if langs:
    intersection = promptfiles[langs[0]]
    # Intersect with the sets of the other languages
    for lang in langs[1:]:
        intersection &= promptfiles[lang]

print("Number of common ppkts:", len(intersection))

if create_list_file:
    with open(output_file, "w") as f:
        for item in intersection:
            f.write(item + "_en-prompt.txt\n")  # Write each item followed by a newline character

# Copy prompts
if copy_prompt_files:
    for id in tqdm.tqdm(intersection, "Copying files..."):
        for lang in langs:
            shutil.copy(fp + lang + "/" + id + "_" + lang + "-prompt.txt", dst_dir + lang)

# Copy jsons
if copy_json_files:
    json_path = os.path.join(fp, "original_phenopackets")
    for jsonfile in tqdm.tqdm(os.listdir(json_path), "Copying json files..."):
        with open(os.path.join(json_path, jsonfile), 'r') as f:
            data = json.load(f)
        id = data['id']
        id = re.sub(r'[^\w]', '_', id)
        if id in intersection:
            shutil.copy(os.path.join(json_path, jsonfile), os.path.join(dst_dir, "jsons", jsonfile))
        else:
            print(f"Skipping {jsonfile}, not in intersection.")
88 changes: 88 additions & 0 deletions analysis/create_hf_datasets.py
@@ -0,0 +1,88 @@
"""
Create Hugging Face datasets from multilingual prompt files.

This script reads prompt files for multiple languages, associates them with correct answers,
and saves them as Parquet files for each language.

The script takes three command-line arguments:
1. The directory containing the input prompt files.
2. The directory where the output Parquet files will be saved.
3. A comma-separated list of languages to process (optional, defaults to a predefined list).

Example usage:
python create_hf_datasets.py /path/to/input /path/to/output "en,es,fr"

To upload the generated Parquet files to Hugging Face:
1. Log in using `huggingface-cli login`.
2. Use the `huggingface-cli upload` command:
   huggingface-cli upload <username>/prompts_llms <output_directory_of_this_script> --repo-type=dataset

For more details, refer to the Hugging Face documentation.
"""

import os
from pathlib import Path
import pandas as pd
import sys

# Default list of languages
default_languages = ['en', 'cs', 'es', 'de', 'it', 'ja', 'nl', 'tr', 'zh', 'fr']


# Parse command-line arguments
try:
    input_dir = Path(sys.argv[1])
except IndexError:
    input_dir = Path(os.getcwd()) / 'in_multlingual_nov24/prompts'
    print('\nYou can pass the input directory as the first CLI argument!\n')

try:
    output_dir = Path(sys.argv[2])
except IndexError:
    output_dir = Path(os.getcwd()) / 'hf_prompts/validation'
    print('\nYou can pass the output directory as the second CLI argument!\n')

try:
    languages_str = sys.argv[3]
    languages = languages_str.split(',')
except IndexError:
    languages = default_languages
    print('\nYou can pass a comma-separated list of languages as the third CLI argument!\n')

# Ensure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Read in correct answers
correct_answer_file = input_dir / 'correct_results.tsv'
correct_answers = pd.read_csv(correct_answer_file, sep='\t', names=['disease_name', 'disease_id', 'file_id'])
correct_answers.set_index('file_id', inplace=True)

# Process each language
for lang in languages:
    lang_out_dir = output_dir / lang
    lang_out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created directory {lang_out_dir}")
    lang_in_dir = input_dir / lang

    rows = []
    for file in lang_in_dir.iterdir():
        # Extract the file ID: swap the language suffix for "en-prompt" so it
        # matches the file_id entries used for the correct-answer lookup
        file_ending = "en-prompt"
        file_id = file.stem[:-len(file_ending)] + file_ending + '.txt'
        gold_dict = {
            'disease_name': correct_answers.loc[file_id, 'disease_name'] if file_id in correct_answers.index else None,
            'disease_id': correct_answers.loc[file_id, 'disease_id'] if file_id in correct_answers.index else None
        }

        # Read the prompt content
        with open(file, 'r') as f:
            prompt = f.read()

        # Append the data to rows
        rows.append({'id': file.stem + '.txt', 'prompt': prompt, 'gold': gold_dict})

    # Save the DataFrame to a Parquet file
    df = pd.DataFrame(rows)
    out_file = lang_out_dir / f'{lang}_hf_prompts'
    df.to_parquet(out_file.with_suffix('.parquet'))
    print(f"Saved prompts to {out_file}.parquet")
@@ -13,14 +13,11 @@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

outpath = "disease_groups/"

outpath = "analysis_out/disease_groups/"
pc_cache_file = outpath + "diagnoses_hereditary_cond"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_adapter() -> OboGraphInterface:
    """
    Get the adapter for the MONDO ontology.
@@ -29,22 +26,16 @@ def mondo_adapter() -> OboGraphInterface:
        Adapter: The adapter.
    """
    return get_adapter("sqlite:obo:mondo")


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_mapping(term, adapter):
    mondos = []
    for m in adapter.sssom_mappings([term], source="OMIM"):
        if m.predicate_id == "skos:exactMatch":
            mondos.append(m.subject_id)
    return mondos


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
    if not isinstance(mondo, MappingProviderInterface):
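
A minimal usage sketch of the two helpers shown in this hunk (not part of the diff itself): map an example OMIM disease CURIE to its exact-match MONDO identifiers. The OMIM ID is an arbitrary illustration, and the call assumes the oaklib MONDO sqlite adapter ("sqlite:obo:mondo") can be downloaded or is already cached locally.

mondo = mondo_adapter()
# Prints a (possibly empty) list of MONDO CURIEs that are skos:exactMatch to the OMIM term
print(mondo_mapping("OMIM:230800", mondo))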