@@ -785,4 +785,107 @@ def maybe_skip_or_force(path, output_files, force):
785
785
else :
786
786
raise Exception ("Programming error" )
787
787
788
- return skip_download_and_extraction , output_files
788
+ return skip_download_and_extraction , output_files
789
+
790
def _summarise_study_links(study_links):
    '''Condense a list of study link dicts into output columns.

    Each link is expected to name its database under a 'db' key or, failing
    that, a 'label' key. Links with neither are collected verbatim under
    'Other study links in list'. Note that the 'db' / 'label' key is removed
    from each link dict as it is processed.

    Returns a dict of column name -> value to merge into the run's output row.
    '''
    summary = {}
    for link in study_links:
        if 'db' in link:
            db = link.pop('db')
        elif 'label' in link:
            db = link.pop('label')
        else:
            summary.setdefault('Other study links in list', []).append(link)
            # No database name is available for this link, so it cannot be
            # classified below. Without this, `db` would be unbound or left
            # over from the previous link.
            continue

        if db == 'pubmed':
            summary['PubMed ID'] = link['id']
        elif db == 'GOLD':
            summary['GOLD ID'] = link['url']
        else:
            # Report whichever field remains first once the db name is removed
            content_name = list(link.keys())[0]
            summary.setdefault('Other study links', {})[db] = link[content_name]
    return summary


def authorship(**kwargs):
    '''Try to attribute authorship / publications of SRA runs.

    Keyword arguments (both keys must be present, exactly one non-None):
    run_identifiers -- collection of run identifiers to look up
    run_identifiers_file -- path to a text file with one run identifier per
        line; blank lines are ignored

    For each run, the run's metadata is fetched and combined with any study
    links it carries, PubMed IDs and EuropePMC DOIs found by searching for the
    study title, and EuropePMC DOIs found by searching for the bioproject
    accession. The resulting table is written to stdout as CSV, one row per
    run.

    Raises an Exception unless exactly one of the two inputs is specified.
    '''
    run_identifiers = kwargs.pop('run_identifiers')
    run_identifiers_file = kwargs.pop('run_identifiers_file')

    num_inputs = sum(
        1 for given in (run_identifiers, run_identifiers_file)
        if given is not None)
    if num_inputs != 1:
        raise Exception("Must specify exactly one input type: --run-identifiers or --run-identifiers-list")

    if run_identifiers_file is not None:
        with open(run_identifiers_file) as f:
            stripped = (line.strip() for line in f)
            # Skip blank lines so they are not looked up as empty accessions
            run_identifiers = [r for r in stripped if r]

    logging.info("Finding associated authorship / publications for {} run(s)".format(len(run_identifiers)))

    # SRR7051058 is a good example of a run with GOLD authorship info

    final_result = []

    for run in run_identifiers:
        logging.debug("Looking up authorship for run {}".format(run))

        # ERR1914274 has a pubmed ID associated
        # <STUDY_LINKS>
        #   <STUDY_LINK>
        #     <XREF_LINK>
        #       <DB>PUBMED</DB>
        #       <ID>29669589</ID>

        # Get the metadata for the run
        metadata = SraMetadata().efetch_sra_from_accessions([run])
        # TODO: Do a single esearch and don't assume a result returned
        m = metadata.iloc[0, :].to_dict()
        # TODO: Account for multiple IDs in the same DB - not sure of an example tho

        to_print = {
            'Run': run,
        }
        if 'study_links' in m:
            study_links = json.loads(m['study_links'])
            to_print.update(_summarise_study_links(study_links))

        # Search PubMed for a title the same as the project name
        # e.g. Characterisation of a sponge microbiome using an integrative genome-centric approach
        # SRR9841429
        study_title = m['study_title']
        logging.debug("Searching PubMed for title '{}'".format(study_title))
        pubmeds_from_title = SraMetadata().fetch_pubmed_ids_from_term(study_title)
        if pubmeds_from_title:
            to_print['PubMed IDs from title'] = ','.join(pubmeds_from_title)

        logging.debug("Searching EuropePMC for title '{}'".format(study_title))
        # TODO: The search for 'Characterisation of a sponge microbiome using an
        # integrative genome-centric approach' gives poor results - better at
        # PubMed. However, searching for 'sponge microbiome using an integrative
        # genome-centric approach' does work. So maybe need to filter out common
        # words?
        citations_from_europe_pmc_title = SraMetadata().fetch_citations_from_query_title(study_title)
        # TODO: Account for papers without a DOI?
        dois = [c['doi'] for c in citations_from_europe_pmc_title]
        if len(dois) > 0:
            to_print['DOIs from EuropePMC title search'] = ','.join(dois)

        # Search by bioproject accession e.g. for PRJEB22302 / ERR2108709
        bioproject = m['bioproject']
        logging.debug("Searching EuropePMC for bioproject accession '{}'".format(bioproject))
        citations_from_europe_pmc_bioproject = SraMetadata().fetch_citations_from_query_bioproject(bioproject)
        dois = [c['doi'] for c in citations_from_europe_pmc_bioproject]
        if len(dois) > 0:
            to_print['DOIs from EuropePMC bioproject search'] = ','.join(dois)

        # Record the row only once every lookup for this run has been added
        final_result.append(to_print)

    # Write out table as CSV
    final = pd.DataFrame(final_result)
    final.to_csv(sys.stdout, index=False)
891
+
0 commit comments