Skip to content

Commit 94449e8

Browse files
committed
authorship: New (experimental) mode.
1 parent 234a502 commit 94449e8

File tree

3 files changed

+186
-1
lines changed

3 files changed

+186
-1
lines changed

bin/kingfisher

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,19 @@ def main():
257257
action='store_true',
258258
)
259259

260+
authorship_description = 'Find publication / authorship of SRA accessions'
261+
authorship_parser = bird_argparser.new_subparser('authorship', authorship_description)
262+
authorship_parser.add_argument(
263+
'-r','--run-identifiers','--run_identifiers',
264+
help='Run number to download/extract e.g. ERR1914274',
265+
nargs='+',
266+
)
267+
# list
268+
authorship_parser.add_argument(
269+
'--run-identifiers-list','--run_identifiers_list','--run-accession-list','--run_accession_list','--run-identifiers-list','--run_identifiers_list',
270+
help='Text file containing a newline-separated list of run identifiers i.e. a 1 column CSV file.',
271+
)
272+
260273
args = bird_argparser.parse_the_args()
261274

262275
logging.info("Kingfisher v{}".format(kingfisher.__version__))
@@ -312,6 +325,11 @@ def main():
312325
output_format = args.output_format,
313326
all_columns = args.all_columns,
314327
)
328+
elif args.subparser_name == 'authorship':
329+
kingfisher.authorship(
330+
run_identifiers = args.run_identifiers,
331+
run_identifiers_file = args.run_identifiers_list,
332+
)
315333
else:
316334
raise Exception("Programming error")
317335

kingfisher/__init__.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,4 +785,107 @@ def maybe_skip_or_force(path, output_files, force):
785785
else:
786786
raise Exception("Programming error")
787787

788-
return skip_download_and_extraction, output_files
788+
return skip_download_and_extraction, output_files
789+
790+
def authorship(**kwargs):
    '''Try to attribute authorship / publications of SRA runs.

    For each run, the run's metadata is fetched and then searched for
    publication hints: cross-references listed in the study links, PubMed
    hits for the study title, Europe PMC hits for the study title, and
    Europe PMC hits for the bioproject accession. One row per run is written
    to stdout as CSV.

    Keyword arguments (both must be passed; exactly one must not be None):
    run_identifiers -- list of run accessions e.g. ['ERR1914274']
    run_identifiers_file -- path to a text file with one run accession per
        line. Blank lines are ignored.

    Raises an Exception if zero or both input types are specified.
    '''
    run_identifiers = kwargs.pop('run_identifiers')
    run_identifiers_file = kwargs.pop('run_identifiers_file')

    # Exactly one of the two inputs may be given.
    if (run_identifiers is None) == (run_identifiers_file is None):
        raise Exception("Must specify exactly one input type: --run-identifiers or --run-identifiers-list")

    if run_identifiers_file is not None:
        with open(run_identifiers_file) as f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) so they are not looked up as empty accessions.
            stripped = (line.strip() for line in f)
            run_identifiers = [r for r in stripped if r]

    logging.info("Finding associated authorship / publications for {} run(s)".format(len(run_identifiers)))

    # SRR7051058 is a good example of a run with GOLD authorship info

    final_result = []

    for run in run_identifiers:
        logging.debug("Looking up authorship for run {}".format(run))

        to_print = {
            'Run': run,
        }

        # Get the metadata for the run
        # TODO: Do a single esearch for all runs
        metadata = SraMetadata().efetch_sra_from_accessions([run])
        # NOTE(review): assumes a pandas DataFrame is returned (it is indexed
        # with .iloc below) - confirm an empty frame is what comes back when
        # nothing is found.
        if len(metadata) == 0:
            logging.warning("No metadata found for run {}, so no publication searches were done".format(run))
            final_result.append(to_print)
            continue
        m = metadata.iloc[0,:].to_dict()

        # ERR1914274 has a pubmed ID associated
        # <STUDY_LINKS>
        #   <STUDY_LINK>
        #     <XREF_LINK>
        #       <DB>PUBMED</DB>
        #       <ID>29669589</ID>
        if 'study_links' in m:
            _record_study_links(json.loads(m['study_links']), to_print)

        # Search PubMed for a title the same as the project name
        # e.g. Characterisation of a sponge microbiome using an integrative genome-centric approach
        # SRR9841429
        study_title = m['study_title']
        logging.debug("Searching PubMed for title '{}'".format(study_title))
        pubmeds_from_title = SraMetadata().fetch_pubmed_ids_from_term(study_title)
        if pubmeds_from_title:
            to_print['PubMed IDs from title'] = ','.join(pubmeds_from_title)

        logging.debug("Searching EuropePMC for title '{}'".format(study_title))
        # TODO: The search for 'Characterisation of a sponge microbiome using an
        # integrative genome-centric approach' gives poor results - better at
        # PubMed. However, searching for 'sponge microbiome using an integrative
        # genome-centric approach' does work. So maybe need to filter out common
        # words?
        citations_from_europe_pmc_title = SraMetadata().fetch_citations_from_query_title(study_title)
        dois = _dois_from_citations(citations_from_europe_pmc_title)
        if dois:
            to_print['DOIs from EuropePMC title search'] = ','.join(dois)

        # Search by bioproject accession e.g. for PRJEB22302 / ERR2108709
        bioproject = m['bioproject']
        logging.debug("Searching EuropePMC for bioproject accession '{}'".format(bioproject))
        citations_from_europe_pmc_bioproject = SraMetadata().fetch_citations_from_query_bioproject(bioproject)
        dois = _dois_from_citations(citations_from_europe_pmc_bioproject)
        if dois:
            to_print['DOIs from EuropePMC bioproject search'] = ','.join(dois)

        final_result.append(to_print)

    # Write out table as CSV
    final = pd.DataFrame(final_result)
    final.to_csv(sys.stdout, index=False)


def _record_study_links(study_links, to_print):
    '''Copy the cross-references in study_links (a list of dicts) into the
    to_print row, which is modified in place.'''
    # TODO: Account for multiple IDs in the same DB - not sure of an example tho
    for link in study_links:
        if 'db' in link:
            db = link.pop('db')
        elif 'label' in link:
            db = link.pop('label')
        else:
            # Nothing names this link, so keep it verbatim and move on.
            # Without the continue, the checks below would use an unbound
            # (or stale, from the previous link) db.
            to_print.setdefault('Other study links in list', []).append(link)
            continue

        if db == 'pubmed':
            to_print['PubMed ID'] = link['id']
        elif db == 'GOLD':
            to_print['GOLD ID'] = link['url']
        else:
            # Record the first remaining field of the link, or None if the
            # link had no field other than its name.
            other_links = to_print.setdefault('Other study links', {})
            other_links[db] = next(iter(link.values()), None)


def _dois_from_citations(citations):
    '''Return the DOIs of those citations (dicts) that have a non-empty one.'''
    return [c['doi'] for c in citations if c.get('doi')]
891+

kingfisher/sra_metadata.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,67 @@ def efetch_sra_from_accessions(self, accessions):
319319
metadata.sort_values([STUDY_ACCESSION_KEY,RUN_ACCESSION_KEY], inplace=True)
320320

321321
return metadata
322+
323+
def fetch_pubmed_ids_from_term(self, term):
    '''Search PubMed through the NCBI esearch endpoint for the given term.

    Returns a list of the text of each entry in the response's IdList, i.e.
    the PubMed IDs as strings. The list is empty if nothing matched.

    Raises an Exception if the HTTP request fails, or if the response
    contains no IdList element.
    '''
    retmax = 10000
    res = requests.get(
        url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params=self.add_api_key({
            "db": "pubmed",
            "term": term,
            "tool": "kingfisher",
            "email": "[email protected]",
            "retmax": retmax,
            "usehistory": "y",
        }),
    )
    if not res.ok:
        raise Exception("HTTP Failure when requesting search from term: {}: {}".format(res, res.text))
    root = ET.fromstring(res.text)
    logging.debug("Root of response: {}".format(ET.tostring(root)))

    # Fail with a readable message instead of a TypeError from iterating
    # None when the IdList element is absent.
    # NOTE(review): esearch is understood to answer some bad queries (e.g.
    # an empty term) with an ERROR element and no IdList - confirm.
    id_list = root.find('IdList')
    if id_list is None:
        raise Exception("Unexpected response when requesting search from term, no IdList found: {}".format(res.text))

    pubmed_ids = [c.text for c in id_list]
    if len(pubmed_ids) == retmax:
        logging.warning("Unexpectedly found the maximum number of results for this query, possibly some results will be missing")
    return pubmed_ids
344+
345+
def fetch_citations_from_query_title(self, title):
    '''Search Europe PMC for the given title.

    Returns a list of the result entries (dicts from the JSON response)
    whose title equals the query title, ignoring case and allowing one
    trailing full stop on the result's title.

    Raises an Exception if the HTTP request fails.
    '''
    # https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=Genome-centric%20view%20of%20carbon%20processing%20in%20thawing%20permafrost&format=json

    # Search for the title using the Europe PMC REST API. Found it to be
    # superior to the NCBI esearch e.g. the query 'Metagenomics of Urban
    # Sewage Identifies an Extensively Shared Antibiotic Resistome in China'
    # hits on the PubMed website, but not in the NCBI esearch - unsure why.
    # Worked out of the box with the Europe PMC REST API.

    res = requests.get(
        url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={
            "query": title,
            "format": "json",
        },
    )
    if not res.ok:
        raise Exception("HTTP Failure when requesting search from term: {}: {}".format(res, res.text))
    root = res.json()
    logging.debug("Root of response: {}".format(root))

    # Return only those that have an exact title match. The accepted forms
    # are computed once rather than per result.
    wanted = title.lower()
    accepted_titles = (wanted, wanted + '.')
    citations = []
    for result in root['resultList']['result']:
        # A result without a title cannot match, so skip it rather than
        # fail with a KeyError.
        result_title = result.get('title')
        logging.debug("Title: {}".format(result_title))
        if result_title is not None and result_title.lower() in accepted_titles:
            citations.append(result)
    return citations
369+
370+
def fetch_citations_from_query_bioproject(self, bioproject):
    '''Search Europe PMC using the bioproject accession as the query.

    Returns every entry of the JSON response's result list, unfiltered.

    Raises an Exception if the HTTP request fails.
    '''
    endpoint = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    query_params = {"query": bioproject, "format": "json"}

    response = requests.get(url=endpoint, params=query_params)
    if not response.ok:
        raise Exception("HTTP Failure when requesting search from term: {}: {}".format(response, response.text))

    payload = response.json()
    logging.debug("Root of response: {}".format(payload))

    # Return all hits
    hits = payload['resultList']['result']
    return hits

0 commit comments

Comments
 (0)