ubuntu2004
#task4-group71#the ion statistics calculator2#Liyue Chang3#import necessary libraries4import argparse56#use argparse to define command lines7parser=argparse.ArgumentParser()8parser.add_argument("inputfile",9help="directory to the input file")10parser.add_argument("-i", "--minimum_mass_to_charge_ratio_range",11help="the minimum range of mass-to-charge of the peptide fragment", type=float,default=1000)12parser.add_argument("-a", "--maximum_mass_to_charge_ratio_range",13help="the maximum range of mass-to-charge of the peptide fragment", type=float,default=1500)14parser.add_argument("-w", "--window_size",15help="size of the sliding window", type=float, default=0.5)16parser.add_argument("-s", "--step_size",17help="size of steps the sliding window slides(must be smaller than window size)", type=float, default=0.2)18parser.add_argument("-m", "--match_pattern",19help="how the input amino acid matches the sequence", choices=['start with','contain'], default='contain')20parser.add_argument("-p", "--amino_acid_matches_sequence",21help="input amino acid matching the sequence",22choices=['K','N','R','T','S','I','M','Q','H','P','L','E','D','A','G','V','Y','S','C','W','F','no'],default='no')23parser.add_argument("create_file",24help="name of the output file")2526args = parser.parse_args()27filename=args.inputfile28min_mz=round(args.minimum_mass_to_charge_ratio_range,4)29max_mz=round(args.maximum_mass_to_charge_ratio_range,4)30window_size=args.window_size31step_size=args.step_size32match_pattern=args.match_pattern33pep_find=args.amino_acid_matches_sequence34create_file=args.create_file3536#lists of useful elements37protein_name=[]38mass_to_charge=[]39sequence=[]40output=open(create_file,'w')4142#define a function to read file and store data43def readfile(filename):44global output45readfile=open(filename,'r')46next(readfile)47lines=readfile.readlines()48list1=[]49cleavage_site=[]50for line in lines:51#use split to handle the data52data=line.split()53protein_name.append(data[0])54mass_to_charge.append(data[3])55cleavage_site.append(data[4])56sequence.append(data[5])57cleavage_site=set(cleavage_site)58for i in cleavage_site:59if i==0:60print("There is no missed cleavage site")61list1.append("no missed cleavage site")62else:63print("Missed cleavage site: ",i)64list1.append("cleavage site: ")65list1.append(i)66output.writelines([list1[0],str(list1[1]),'\n'])6768#define a function to count number of peptides69def count_num(min_mz,max_mz):70count_pep=071#use lists to store output data72list1=[]73list2=[]74list3=[]75global output76#use defined list mass_to_charge, loops and if statement to count77for m in mass_to_charge:78mz=float(m)79if ((min_mz<mz)&(mz<max_mz)):80count_pep+=181print("mass range:", "minimum:",round(min_mz,4),", maximum:",round(max_mz,4)," count:",count_pep)82list1.append(round(min_mz,4))83list2.append(round(max_mz,4))84list3.append(count_pep)85for i in zip(list1,list2,list3):86output.writelines([str(i[0]),'\t',str(i[1]),'\t',str(i[2]),'\n'])87return count_pep8889#define a function to find unique protein90def find_protein():91mz_num={}92uni_pr=[]93rep_pr={}94rep_pr1=[]95list1=[]96list2=[]97num_rep={}98num_all={}99cons_pr={}100cons_pr1=[]101num_rep1={}102uni_pr1=[]103global output,mass_to_charge,protein_name104105#record the frequency of matched m/z of sequences106for i in range(len(mass_to_charge)):107if mass_to_charge[i] in mz_num:108mz_num[mass_to_charge[i]]+=1109else:110mz_num[mass_to_charge[i]]=1111112#a dictionary containing all duplicated m/z and corresponding protein name113for key in mz_num:114if mz_num[key]!=1:115rep_pr[key]=[]116for key in rep_pr.keys():117for index,nums in enumerate(mass_to_charge):118if nums==key:119rep_pr[key].append(protein_name[index])120#a dictionary containing m/z within the range (1000,1500)121if float(key)<1500.0 and float(key)>1000.0:122cons_pr[key]=[]123for key in cons_pr.keys():124for index,nums in enumerate(mass_to_charge):125if nums==key:126cons_pr[key].append(protein_name[index])127128#compare protein with same m/z129for i in rep_pr:130check_pr=[]131check_p=[]132for i in rep_pr[i]:133check_pr.append(i)134check_p=set(check_pr)135if check_p!=1:136rep_pr1.append(i)137138#use dictionary to compare duplicated protein with all proteins and proteins within m/z range of 1000 and 1500139for rp in range(len(rep_pr1)):140if rep_pr1[rp] in num_rep:141num_rep[rep_pr1[rp]]+=1142else:143num_rep[rep_pr1[rp]]=1144for i in cons_pr:145check_pr1=[]146check_p1=[]147for i in cons_pr[i]:148check_pr1.append(i)149check_p1=set(check_pr)150if check_p!=1:151cons_pr1.append(i)152for cp in range(len(cons_pr1)):153if cons_pr1[cp] in num_rep1:154num_rep1[cons_pr1[cp]]+=1155else:156num_rep1[cons_pr1[cp]]=1157for pr in range(len(protein_name)):158if protein_name[pr] in num_all:159num_all[protein_name[pr]]+=1160else:161num_all[protein_name[pr]]=1162for m in num_rep.keys():163if m in num_all.keys():164if num_rep[m]<num_all[m]:165uni_pr.append(m)166for m in num_rep1.keys():167if m in num_all.keys():168if num_rep1[m]<num_all[m]:169uni_pr1.append(m)170if len(uni_pr)==1:171print(len(uni_pr)," protein has been unambiguously identified in all proteins. ")172#print(" ".join(str(i) for i in uni_pr))173else:174print(len(uni_pr)," proteins have been unambiguously identified in all proteins. ")175#print(",".join(str(i) for i in uni_pr))176if len(uni_pr1)==1:177print(len(uni_pr1)," protein has been unambiguously identified within 1000 to 1500 m/z range. ")178#print(" ".join(str(i) for i in uni_pr1))179else:180print(len(uni_pr1)," proteins have been unambiguously identified within 1000 to 1500 m/z range. ")181#print(",".join(str(i) for i in uni_pr1))182list1.append(len(uni_pr))183list1.append(len(uni_pr1))184output.writelines(["all unique proteins: ",str(list1[0]),'\n',"unique proteins within 1000 to 1500 m/z: ", str(list1[1]),'\n'])185186#define a function to match peptide187def match_peptide(pep_find,match_pattern):188list1=[]189list2=[]190match=[]191global output192if pep_find!="no":193#differentiate different match pattern194if match_pattern=="start with":195for s in sequence:196if s.startswith(pep_find):197match.append(s)198if len(match)==0:199print("There is no sequence starts with ",pep_find," in all peptides.")200else:201for s in sequence:202match.append(s)203if len(match)==0:204print("There is no sequence contains ",pep_find," in all peptides.")205if len(match)==1:206print("There is ",len(match)," sequence ",match_pattern, pep_find," in all peptides.")207else:208print("There are ",len(match)," sequences ",match_pattern, pep_find," in all peptides.")209list1.append("number of matched sequence")210list2.append(len(match))211for i in zip(list1,list2):212output.writelines([str(i[0]),'\t',str(i[1]),'\n'])213#main code starts here214readfile(filename)215match_peptide(pep_find,match_pattern)216find_protein()217output.writelines(['min_range','\t','max_range','\t','peps_in_range','\n'])218count_num(min_mz,max_mz)219#use while loop to do sliding window220while min_mz<max_mz:221window=min_mz+window_size222if window>max_mz:223window=max_mz224count_num(min_mz,window)225min_mz+=step_size226output.close()227228