Fix select extend summary #67

Merged 11 commits on Jun 17, 2025
79 changes: 79 additions & 0 deletions analysis/PMID_to_date_extractor.py
@@ -0,0 +1,79 @@
""" This script extracts the field .metaData.externalReferences from all json files in a directory and saves them in a dictionary.
It then uses the PubMedFetcher to fetch the corresponding PubMed article's date of publication for each reference.
The output is a dictionary with the PubMed IDs as keys and their publication dates as values.

Needs an NCBI API key to work. The key can be set in the environment variable NCBI_API_KEY or read from a file.
"""

import json
import os
import sys

from metapub import PubMedFetcher
from tqdm import tqdm


# Path to the directory containing the JSON files, taken as CLI input
try:
    ppkts_dir = str(sys.argv[1])
except IndexError:
    ppkts_dir = '/Users/leonardo/data/ppkts_4967_polyglot/jsons'

# Get the directory of the current script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Set up the NCBI API key
# Uncomment the following lines to read the API key from a file (does not work...)
"""# Read key file ~/ncbi.key
with open(os.path.expanduser('~/ncbi.key'), 'r') as f:
    os.environ['NCBI_API_KEY'] = f.read().strip()
"""

# Iterate over all JSON files in the directory and populate a dictionary keyed by
# filename, with the field .metaData.externalReferences as the value
ppkts = {}
for filename in os.listdir(ppkts_dir):
    if filename.endswith('.json'):
        with open(os.path.join(ppkts_dir, filename), 'r') as f:
            data = json.load(f)
            # Check if the key exists in the JSON data
            if 'metaData' in data and 'externalReferences' in data['metaData']:
                ppkts[filename] = data['metaData']['externalReferences']
            else:
                print(f"Key not found in {filename}")

pmid_list = []
# Iterate over the dictionary and build a list of unique PubMed IDs
for filename, references in ppkts.items():
    for reference in references:
        if 'id' in reference:
            pmid_list.append(reference['id'].replace("PMID:", "").strip())
pmid_list = list(set(pmid_list)) # Remove duplicates
# Print the number of unique PubMed IDs found
print(f"Found {len(pmid_list)} unique PubMed IDs.")

# For each unique PubMed ID, fetch the corresponding article's date of publication (Entrez history date)
fetcher = PubMedFetcher()
pmid2date_dict = {}
for pmid in tqdm(pmid_list, desc="Fetching PubMed data", unit="PMID"):
    try:
        article = fetcher.article_by_pmid(pmid)
        pmid2date_dict['PMID:' + pmid] = {
            'date': article.history['entrez']
        }
    except Exception as e:
        print(f"Error fetching data for {pmid}: {e}")


# Save the dictionary to a JSON file
# Construct the output file path relative to the script's directory
output_json_file = os.path.join(script_dir, "../../../leakage_experiment", "pmid2date_dict.json")
# Ensure the output directory exists
output_dir = os.path.dirname(output_json_file)
os.makedirs(output_dir, exist_ok=True)
# Write the dictionary to a JSON file
with open(output_json_file, "w") as f:
    json.dump(pmid2date_dict, f, indent=4, default=str)  # Use default=str to handle datetime objects

print(f"Saved dictionary to {output_json_file}")

61 changes: 61 additions & 0 deletions analysis/count_gpt_tokens.py
@@ -0,0 +1,61 @@
import tiktoken
import os

# Choose the encoding for your model (e.g., gpt-3.5-turbo, gpt-4, etc.)
encoding = tiktoken.encoding_for_model("gpt-4o")

langs = ["en", "cs", "de", "es", "fr", "it", "ja", "tr", "zh", "nl"]

# For each language, count the tokens in all files in the directory and save the totals
directory = "in_multlingual_nov24/prompts/"

count_dict_input = {}
count_dict_output = {}

for lang in langs:
    # INPUT COST
    total_tokens = 0
    langpath = os.path.join(directory, lang)
    for filename in os.listdir(langpath):  # For all files
        if filename.endswith(f"_{lang}-prompt.txt"):
            with open(os.path.join(langpath, filename), 'r', encoding='utf-8') as file:
                text = file.read()
            num_tokens = len(encoding.encode(text))
            total_tokens += num_tokens
    count_dict_input[lang] = total_tokens
    print(f"Total input tokens for {lang}: {total_tokens}")

    # OUTPUT COST
    total_tokens = 0
    outpath = f"out_multlingual_nov24/raw_results/multilingual/{lang}/differentials_by_file/"
    for filename in os.listdir(outpath):
        if filename.endswith(f"_{lang}-prompt.txt.result"):
            with open(os.path.join(outpath, filename), 'r', encoding='utf-8') as file:
                text = file.read()
            num_tokens = len(encoding.encode(text, allowed_special={'<|endoftext|>'}))
            total_tokens += num_tokens
    count_dict_output[lang] = total_tokens
    print(f"Total output tokens for {lang}: {total_tokens}")


count_dict_input['total'] = sum(count_dict_input.values())
count_dict_output['total'] = sum(count_dict_output.values())

print(f"Full input token count: {count_dict_input['total']}")
print(f"Full output token count: {count_dict_output['total']}")

# Save the dictionaries to files
input_file = "analysis_out/token_counts/input_token_counts.txt"
output_file = "analysis_out/token_counts/output_token_counts.txt"
os.makedirs(os.path.dirname(input_file), exist_ok=True)
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(input_file, 'w') as f:
    f.write("Input cost for GPT-4o $5.00 / 1M input tokens\n")
    for lang, count in count_dict_input.items():
        f.write(f"{lang}: {count}\n")
print(f"Input token counts saved to {input_file}")
with open(output_file, 'w') as f:
    f.write("Output cost for GPT-4o $20.00 / 1M output tokens\n")
    for lang, count in count_dict_output.items():
        f.write(f"{lang}: {count}\n")
print(f"Output token counts saved to {output_file}")
91 changes: 91 additions & 0 deletions analysis/count_translated_prompts_and_copy.py
@@ -0,0 +1,91 @@
"""Look in the phenopacket2prompt output directory for the common phenopackets across languages and copy them to another directory.
"""

import os
import re
import shutil
import tqdm
import sys
import json


create_list_file = False
copy_prompt_files = False
copy_json_files = True

try:
    fp = sys.argv[1]
except IndexError:
    print("No path provided, using default path.")
    # Default path to the phenopacket2prompt output directory
    fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/6668prompts/"

try:
    output_file = sys.argv[2]
except IndexError:
    print("No output file provided, using default output file.")
    # Default output file to save the common phenopackets
    output_file = "final_multilingual_output/ppkts_4917set.txt"  # Specify the output file name

try:
    dst_dir = sys.argv[3]
except IndexError:
    # Default destination directory to copy the files
    print("No destination directory provided, using default destination directory.")
    dst_dir = "/Users/leonardo/data/4917_poly_ppkts"

# Take as a fourth argument the comma-separated list of languages to consider
if len(sys.argv) > 4:
    langs = sys.argv[4].split(",")
else:
    # Default languages to consider
    langs = ["en", "ja", "es", "de", "it", "nl", "tr", "zh", "cs", "fr"]
    print("No languages provided, using default languages: ", langs, "\nYou can provide them as a comma-separated list as the fourth argument.")

promptfiles = {}
for lang in langs:
    promptfiles[lang] = []
    for dirpath, dirnames, filenames in os.walk(fp + lang):
        for fn in filenames:
            fn = fn.replace("_" + lang + "-prompt.txt", "")
            promptfiles[lang].append(fn)
        break

intersection = set()

# Convert lists to sets for intersection
promptfiles = {lang: set(files) for lang, files in promptfiles.items()}

# Create an intersection set of all languages
# Initialize the intersection with the first language's set
if langs:
    intersection = promptfiles[langs[0]]
    # Intersect with the sets of the other languages
    for lang in langs[1:]:
        intersection &= promptfiles[lang]

print("Number of common ppkts:", len(intersection))

if create_list_file:
    with open(output_file, "w") as f:
        for item in intersection:
            f.write(item + "_en-prompt.txt\n")  # Write each item followed by a newline character

# Copy prompts
if copy_prompt_files:
    for id in tqdm.tqdm(intersection, "Copying files..."):
        for lang in langs:
            shutil.copy(fp + lang + "/" + id + "_" + lang + "-prompt.txt", dst_dir + lang)

# Copy jsons
if copy_json_files:
    json_path = os.path.join(fp, "original_phenopackets")
    for jsonfile in tqdm.tqdm(os.listdir(json_path), "Copying json files..."):
        with open(os.path.join(json_path, jsonfile), 'r') as f:
            data = json.load(f)
        id = data['id']
        id = re.sub(r'[^\w]', '_', id)
        if id in intersection:
            shutil.copy(os.path.join(json_path, jsonfile), os.path.join(dst_dir, "jsons", jsonfile))
        else:
            print(f"Skipping {jsonfile}, not in intersection.")
88 changes: 88 additions & 0 deletions analysis/create_hf_datasets.py
@@ -0,0 +1,88 @@
"""
Create Hugging Face datasets from multilingual prompt files.

This script reads prompt files for multiple languages, associates them with correct answers,
and saves them as Parquet files for each language.

The script takes three command-line arguments:
1. The directory containing the input prompt files.
2. The directory where the output Parquet files will be saved.
3. A comma-separated list of languages to process (optional, defaults to a predefined list).

Example usage:
python create_hf_datasets.py /path/to/input /path/to/output "en,es,fr"

To upload the generated Parquet files to Hugging Face:
1. Log in using `huggingface-cli login`.
2. Use the `huggingface-cli upload` command:
   huggingface-cli upload <username>/prompts_llms <output_directory_of_this_script> --repo-type=dataset

For more details, refer to the Hugging Face documentation.
"""

import os
from pathlib import Path
import pandas as pd
import sys

# Default list of languages
default_languages = ['en', 'cs', 'es', 'de', 'it', 'ja', 'nl', 'tr', 'zh', 'fr']


# Parse command-line arguments
try:
    input_dir = Path(sys.argv[1])
except IndexError:
    input_dir = Path(os.getcwd()) / 'in_multlingual_nov24/prompts'
    print('\nYou can pass the input directory as the first CLI argument!\n')

try:
    output_dir = Path(sys.argv[2])
except IndexError:
    output_dir = Path(os.getcwd()) / 'hf_prompts/validation'
    print('\nYou can pass the output directory as the second CLI argument!\n')

try:
    languages_str = sys.argv[3]
    languages = languages_str.split(',')
except IndexError:
    languages = default_languages
    print('\nYou can pass a comma-separated list of languages as the third CLI argument!\n')

# Ensure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Read in correct answers
correct_answer_file = input_dir / 'correct_results.tsv'
correct_answers = pd.read_csv(correct_answer_file, sep='\t', names=['disease_name', 'disease_id', 'file_id'])
correct_answers.set_index('file_id', inplace=True)

# Process each language
for lang in languages:
    lang_out_dir = output_dir / lang
    lang_out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created directory {lang_out_dir}")
    lang_in_dir = input_dir / lang

    rows = []
    for file in lang_in_dir.iterdir():
        # Extract the file ID: swap the language suffix for "en-prompt" so it
        # matches the file_id entries used for the correct-answer lookup
        file_ending = "en-prompt"
        file_id = file.stem[:-len(file_ending)] + file_ending + '.txt'
        gold_dict = {
            'disease_name': correct_answers.loc[file_id, 'disease_name'] if file_id in correct_answers.index else None,
            'disease_id': correct_answers.loc[file_id, 'disease_id'] if file_id in correct_answers.index else None
        }

        # Read the prompt content
        with open(file, 'r') as f:
            prompt = f.read()

        # Append the data to rows
        rows.append({'id': file.stem + '.txt', 'prompt': prompt, 'gold': gold_dict})

    # Save the DataFrame to a Parquet file
    df = pd.DataFrame(rows)
    out_file = lang_out_dir / f'{lang}_hf_prompts'
    df.to_parquet(out_file.with_suffix('.parquet'))
    print(f"Saved prompts to {out_file}.parquet")
@@ -13,14 +13,11 @@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

outpath = "disease_groups/"

outpath = "analysis_out/disease_groups/"
pc_cache_file = outpath + "diagnoses_hereditary_cond"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_adapter() -> OboGraphInterface:
    """
    Get the adapter for the MONDO ontology.
@@ -29,22 +26,16 @@ def mondo_adapter() -> OboGraphInterface:
        Adapter: The adapter.
    """
    return get_adapter("sqlite:obo:mondo")


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_mapping(term, adapter):
    mondos = []
    for m in adapter.sssom_mappings([term], source="OMIM"):
        if m.predicate_id == "skos:exactMatch":
            mondos.append(m.subject_id)
    return mondos


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
    if not isinstance(mondo, MappingProviderInterface):
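
A minimal usage sketch of the two helpers shown in this hunk (not part of the diff itself): map an example OMIM disease CURIE to its exact-match MONDO identifiers. The OMIM ID is an arbitrary illustration, and the call assumes the oaklib MONDO sqlite adapter ("sqlite:obo:mondo") can be downloaded or is already cached locally.

mondo = mondo_adapter()
# Prints a (possibly empty) list of MONDO CURIEs that are skos:exactMatch to the OMIM term
print(mondo_mapping("OMIM:230800", mondo))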