33##################################
44import pandas as pd
55import os
6+ from .utils import load_and_subset_gff
67
78####################################
89### check MGEs annotations stats ###
@@ -14,6 +15,9 @@ def mges_stats(bacannot_summary):
1415
1516 # load dir of samples' results
1617 results_dir = bacannot_summary [sample ]['results_dir' ]
18+
19+ # load gff_file
20+ gff_file = f"{ results_dir } /gffs/{ sample } .gff"
1721
1822 # integron_finder
1923 if os .path .exists (f"{ results_dir } /integron_finder/{ sample } _integrons.gff" ) and os .stat (f"{ results_dir } /integron_finder/{ sample } _integrons.gff" ).st_size > 0 :
@@ -54,4 +58,56 @@ def mges_stats(bacannot_summary):
5458 bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['end' ] = row ['end' ]
5559 bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['type' ] = row ['atts' ].split (';' )[1 ].split ('=' )[- 1 ]
5660 bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['source' ] = row ['source' ]
57- bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['product' ] = row ['type' ]
61+ bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['product' ] = row ['type' ]
62+
63+ # ICE database
64+ ice_db_blastp = f"{ results_dir } /ICEs/{ sample } _iceberg_blastp_onGenes.summary.txt"
65+ if os .path .exists (ice_db_blastp ) and os .stat (ice_db_blastp ).st_size > 0 :
66+
67+ # init MGE annotation dictionary
68+ if 'MGE' not in bacannot_summary [sample ]:
69+ bacannot_summary [sample ]['MGE' ] = {}
70+
71+ # init iceberg annotation dictionary
72+ if 'ICE' not in bacannot_summary [sample ]['MGE' ]:
73+ bacannot_summary [sample ]['MGE' ]['ICEberg' ] = {}
74+
75+ # init iceberg blastp annotation dictionary
76+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ] = {}
77+
78+ # load integron_finder results
79+ results = pd .read_csv (
80+ ice_db_blastp ,
81+ sep = '\t '
82+ )
83+
84+ # load gff
85+ gff = load_and_subset_gff (gff_file , 'source' , 'ICE' )
86+
87+ # number of integron_finder annotations
88+ total_number = len (results .index )
89+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ]['total' ] = total_number
90+
91+ # per gene info
92+ if int (results .shape [0 ]) > 0 :
93+ for seq in [ str (x ) for x in results ['SEQUENCE' ].unique () ]:
94+
95+ # details missing in output but available in gff
96+ gff_row = gff [gff ['attributes' ].str .contains (seq )]
97+ contig = gff_row ['seq' ].item ()
98+ start = gff_row ['start' ].item ()
99+ end = gff_row ['end' ].item ()
100+
101+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ] = {}
102+ for index , row in results [results ['SEQUENCE' ] == seq ].reset_index ().iterrows ():
103+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ] = {}
104+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['id' ] = row ['ICEBERG_ID' ]
105+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['contig' ] = contig
106+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['start' ] = start
107+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['end' ] = end
108+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['accession' ] = row ['ACCESSION' ]
109+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['product' ] = row ['PRODUCT' ]
110+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['description' ] = row ['DESCRIPTION' ]
111+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['blast_start' ] = row ['START' ]
112+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['blast_end' ] = row ['END' ]
113+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['strand' ] = row ['STRAND' ]
0 commit comments