authorship: New (experimental) mode.

wwood · wwood · commit 94449e857a5a · 2024-07-15T07:52:46.000+10:00
diff --git a/bin/kingfisher b/bin/kingfisher
@@ -257,6 +257,19 @@ def main():
         action='store_true',
     )
 
+    authorship_description = 'Find publication / authorship of SRA accessions'
+    authorship_parser = bird_argparser.new_subparser('authorship', authorship_description)
+    authorship_parser.add_argument(
+        '-r','--run-identifiers','--run_identifiers',
+        help='Run identifier(s) to find authorship / publications for e.g. ERR1914274',
+        nargs='+',
+    )
+    # Alternative input: a file listing the run identifiers, one per line
+    authorship_parser.add_argument(
+        '--run-identifiers-list','--run_identifiers_list','--run-accession-list','--run_accession_list',
+        help='Text file containing a newline-separated list of run identifiers i.e. a 1 column CSV file.',
+    )
+
     args = bird_argparser.parse_the_args()
 
     logging.info("Kingfisher v{}".format(kingfisher.__version__))
@@ -312,6 +325,11 @@ def main():
             output_format = args.output_format,
             all_columns = args.all_columns,
         )
+    elif args.subparser_name == 'authorship':
+        kingfisher.authorship(
+            run_identifiers = args.run_identifiers,
+            run_identifiers_file = args.run_identifiers_list,
+        )
     else:
         raise Exception("Programming error")
 
diff --git a/kingfisher/__init__.py b/kingfisher/__init__.py
@@ -785,4 +785,120 @@ def maybe_skip_or_force(path, output_files, force):
         else:
             raise Exception("Programming error")
 
-    return skip_download_and_extraction, output_files
+    return skip_download_and_extraction, output_files
+
+def authorship(**kwargs):
+    '''Try to attribute authorship / publications of SRA runs.
+
+    For each run, the run's metadata is fetched and mined for publication
+    hints: cross-references in the study links (e.g. PubMed or GOLD IDs),
+    PubMed and EuropePMC searches on the study title, and a EuropePMC search
+    on the bioproject accession. One row per run is written to stdout as CSV.
+
+    Keyword arguments (exactly one of these must not be None):
+    run_identifiers: list of run accessions e.g. ['ERR1914274']
+    run_identifiers_file: path to a text file of run accessions, one per line
+
+    Raises an Exception unless exactly one input type is specified.
+    '''
+    run_identifiers = kwargs.pop('run_identifiers')
+    run_identifiers_file = kwargs.pop('run_identifiers_file')
+
+    num_inputs = 0
+    if run_identifiers is not None: num_inputs += 1
+    if run_identifiers_file is not None: num_inputs += 1
+    if num_inputs != 1:
+        raise Exception("Must specify exactly one input type: --run-identifiers or --run-identifiers-list")
+
+    if run_identifiers_file is not None:
+        with open(run_identifiers_file) as f:
+            # Skip blank lines (e.g. trailing newlines) so that empty
+            # identifiers are never looked up
+            run_identifiers = [r.strip() for r in f if r.strip()]
+
+    logging.info("Finding associated authorship / publications for {} run(s)".format(len(run_identifiers)))
+
+    # SRR7051058 is a good example of a run with GOLD authorship info
+
+    final_result = []
+
+    for run in run_identifiers:
+        logging.debug("Looking up authorship for run {}".format(run))
+
+        # ERR1914274 has a pubmed ID associated
+        # <STUDY_LINKS>
+        # <STUDY_LINK>
+        # <XREF_LINK>
+        # <DB>PUBMED</DB>
+        # <ID>29669589</ID>
+
+        # Get the metadata for the run
+        metadata = SraMetadata().efetch_sra_from_accessions([run])
+        # TODO: Do a single esearch and don't assume a result returned
+        m = metadata.iloc[0,:].to_dict()
+        # TODO: Account for multiple IDs in the same DB - not sure of an example tho
+
+        to_print = {
+            'Run': run,
+        }
+        # A missing value (e.g. NaN) cannot be parsed as JSON, so only
+        # string values are considered
+        study_links_json = m.get('study_links')
+        if isinstance(study_links_json, str):
+            study_links = json.loads(study_links_json)
+            for link in study_links:
+                if 'db' in link:
+                    db = link.pop('db')
+                elif 'label' in link:
+                    db = link.pop('label')
+                else:
+                    # There is no DB or label to file this link under, so
+                    # record it verbatim. Continue, because otherwise db
+                    # is undefined (or stale from the previous link) below.
+                    to_print.setdefault('Other study links in list', []).append(link)
+                    continue
+
+                if db == 'pubmed':
+                    to_print['PubMed ID'] = link['id']
+                elif db == 'GOLD':
+                    to_print['GOLD ID'] = link['url']
+                elif link:
+                    # Record the first remaining field of the link
+                    content_name = next(iter(link))
+                    to_print.setdefault('Other study links', {})[db] = link[content_name]
+
+        # Search PubMed for a title the same as the project name
+        # e.g. Characterisation of a sponge microbiome using an integrative genome-centric approach
+        # SRR9841429
+        study_title = m['study_title']
+        logging.debug("Searching PubMed for title '{}'".format(study_title))
+        pubmeds_from_title = SraMetadata().fetch_pubmed_ids_from_term(study_title)
+        if pubmeds_from_title:
+            to_print['PubMed IDs from title'] = ','.join(pubmeds_from_title)
+
+        logging.debug("Searching EuropePMC for title '{}'".format(study_title))
+        # TODO: The search for 'Characterisation of a sponge microbiome using an
+        # integrative genome-centric approach' gives poor results - better at
+        # PubMed. However, searching for 'sponge microbiome using an integrative
+        # genome-centric approach' does work. So maybe need to filter out common
+        # words?
+        citations_from_europe_pmc_title = SraMetadata().fetch_citations_from_query_title(study_title)
+        # Not every citation has a DOI, so those without are skipped
+        dois = [c['doi'] for c in citations_from_europe_pmc_title if 'doi' in c]
+        if len(dois) > 0:
+            to_print['DOIs from EuropePMC title search'] = ','.join(dois)
+
+        # Search by bioproject accession e.g. for PRJEB22302 / ERR2108709
+        bioproject = m['bioproject']
+        logging.debug("Searching EuropePMC for bioproject accession '{}'".format(bioproject))
+        citations_from_europe_pmc_bioproject = SraMetadata().fetch_citations_from_query_bioproject(bioproject)
+        dois = [c['doi'] for c in citations_from_europe_pmc_bioproject if 'doi' in c]
+        if len(dois) > 0:
+            to_print['DOIs from EuropePMC bioproject search'] = ','.join(dois)
+
+        final_result.append(to_print)
+
+    # Write out table as CSV
+    final = pd.DataFrame(final_result)
+    final.to_csv(sys.stdout, index=False)
+
diff --git a/kingfisher/sra_metadata.py b/kingfisher/sra_metadata.py
@@ -319,3 +319,99 @@ def efetch_sra_from_accessions(self, accessions):
         metadata.sort_values([STUDY_ACCESSION_KEY,RUN_ACCESSION_KEY], inplace=True)
 
         return metadata
+
+    def fetch_pubmed_ids_from_term(self, term):
+        '''Search PubMed via NCBI esearch, returning the PubMed IDs that hit.
+
+        term: the search term e.g. a study title
+
+        Returns a list of the PubMed IDs found, which is empty if there were no
+        hits. A warning is logged if the maximum number of results (retmax) is
+        returned. Raises an Exception if the HTTP request fails.
+        '''
+        retmax = 10000
+        res = requests.get(
+            url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
+            params=self.add_api_key({
+                "db": "pubmed",
+                "term": term,
+                "tool": "kingfisher",
+                "email": "kingfisher@github.com",
+                "retmax": retmax,
+                "usehistory": "y",
+                }),
+            )
+        if not res.ok:
+            raise Exception("HTTP Failure when requesting search from term: {}: {}".format(res, res.text))
+        root = ET.fromstring(res.text)
+        logging.debug("Root of response: {}".format(ET.tostring(root)))
+        id_list = root.find('IdList')
+        if id_list is None:
+            # The response has no IdList element (presumably an error
+            # response). Iterating over None would raise a TypeError.
+            logging.warning("No IdList found in the esearch response, so no PubMed IDs were found")
+            return []
+        pubmed_ids = [c.text for c in id_list]
+        if len(pubmed_ids) == retmax:
+            logging.warning("Unexpectedly found the maximum number of results for this query, possibly some results will be missing")
+        return pubmed_ids
+
+    def fetch_citations_from_query_title(self, title):
+        '''Search EuropePMC for citations with the same title as that given.
+
+        title: the title to search for e.g. a study title
+
+        Returns a list of the EuropePMC results (dicts) whose title is the same
+        as the given title, ignoring case and a trailing '.'. Raises an
+        Exception if the HTTP request fails.
+        '''
+        # https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=Genome-centric%20view%20of%20carbon%20processing%20in%20thawing%20permafrost&format=json
+
+        # Search for the title using the EuropePMC rest API. Found it to be superior to the NCBI esearch e.g. the query 'Metagenomics of Urban Sewage Identifies an Extensively Shared Antibiotic Resistome in China' hits on the PubMed website, but not in the NCBI esearch - unsure why. Worked out of the box with the EuropePMC rest API.
+
+        res = requests.get(
+            url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
+            params={
+                "query": title,
+                "format": "json",
+                },
+            )
+        if not res.ok:
+            raise Exception("HTTP Failure when requesting search from term: {}: {}".format(res, res.text))
+        root = res.json()
+        logging.debug("Root of response: {}".format(root))
+
+        # Return only those that have an exact title match. Missing keys are
+        # tolerated so that a response without results does not raise a KeyError.
+        acceptable_titles = (title.lower(), title.lower() + '.')
+        citations = []
+        for result in root.get('resultList', {}).get('result', []):
+            result_title = result.get('title')
+            logging.debug("Title: {}".format(result_title))
+            if result_title is not None and result_title.lower() in acceptable_titles:
+                citations.append(result)
+        return citations
+
+    def fetch_citations_from_query_bioproject(self, bioproject):
+        '''Search EuropePMC using a bioproject accession as the query.
+
+        bioproject: the accession to search for e.g. PRJEB22302
+
+        Returns a list of all the EuropePMC results (dicts) that hit, which is
+        empty if there were none. Raises an Exception if the HTTP request fails.
+        '''
+        res = requests.get(
+            url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
+            params={
+                "query": bioproject,
+                "format": "json",
+                },
+            )
+        if not res.ok:
+            raise Exception("HTTP Failure when requesting search from term: {}: {}".format(res, res.text))
+        root = res.json()
+        logging.debug("Root of response: {}".format(root))
+
+        # Return all hits. Missing keys are tolerated so that a response
+        # without results does not raise a KeyError.
+        return root.get('resultList', {}).get('result', [])