Package install :: Package MoSTBioDat :: Package DataBase :: Package ImportData :: Package Data2DB :: Module SDFile
[hide private]
[frames] | no frames]

Source Code for Module install.MoSTBioDat.DataBase.ImportData.Data2DB.SDFile

  1  #!/usr/bin/env python 
  2  ######################### 
  3  # SDFile.py             # 
  4  # Parse SDF file        # 
  5  ######################### 
  6   
  7  ###################################################### 
  8  # Copyright (c) 2007-2008 Andrzej Bak                # 
  9  # ARC Seibersdorf & University of Silesia            # 
 10  # Author: Andrzej Bak <Andrzej.Bak@us.edu.pl>        # 
 11  # License: GNU General Public License, version: 3    # 
 12  # URL: http://chemoinformatyka.us.edu.pl/mostbiodat/ # 
 13  # Version: 1, 06.01.2010                             # 
 14  ###################################################### 
 15   
 16  try: 
 17      import sys 
 18      import os 
 19      import shutil 
 20      import numpy 
 21      import anydbm 
 22      from openeye.oechem import * 
 23      from MoSTBioDat.DataBase.ImportData.Data2DB.DBFile import DBFile,InputDB,filterKeys 
 24      from MoSTBioDat.Log.MoSTBioDatLog import  MoSTBioDatLog 
 25  except ImportError,e: 
 26      print 'Error: %s' %e 
 27      sys.exit(1) 
 28   
 29  ########## SDF file class ########################### 
30 -class SDFile(DBFile):
31 """ 32 Simple SDF file parser 33 INPUT: 34 dictpath - str - path to dictionary 35 dictfilename - str - dictionary filename 36 dbfile - str - database file path 37 temporary - str - path to temporary dictionary file 38 39 format - string format for log handler 40 filter - filter object from logger object 41 datefmt - data/time format 42 path - directory path to log file 43 filename - log file name, default log 44 filemode - mode to open log file, default='a' 45 level - set root logger level to specified level 46 logfilelevel- set level to log file 47 48 OUTPUT: 49 dictionary object 50 """
51 - def __init__(self,sdfile=None,**kwargs):
52 DBFile.__init__(self,dbfile=sdfile,**kwargs) 53 try: 54 self.logobj=MoSTBioDatLog(**kwargs)#create logging object 55 self.log=self.logobj.getLogHandler()#create logging handler 56 except IOError,e: 57 print 'Error: %s, %s' %(e[0],e[1]) 58 sys.exit(1)
59
60 - def parseMolOE(self,imolstream,**kwarg):
61 """ 62 get SDF fields from molecule using OpenEye 63 INPUT: 64 imolstream - OEChem input object 65 natoms - boolean, get number of atoms 66 nbonds - boolean, get number of bonds 67 coords - boolean, get coords list [ x y z atomic_number q ] 68 boonds - boolean, get bonds list [at1 at2 bondtype] 69 tag - boolean, get all tags if exists 70 - str, get specified tag 71 - list of strings, get specified tags 72 OUTPUT: 73 result dictionary 74 """ 75 ofs=oemolostream() 76 ofs.openstring() 77 ofs.SetFormat(OEFormat_ISM) 78 79 mol1=OEGraphMol() 80 while True: 81 try: 82 mol=imolstream.next() 83 except StopIteration: break 84 85 result={} 86 property={} 87 result['filename']=mol.GetTitle()#insert filename 88 89 OEWriteMolecule(ofs,mol)#write string to molecule 90 ofs.SetString('')#clean string 91 92 isosmi=self.CanSmi(mol=mol,iso=True,kek=False,verbose=True)#create isomeric smile 93 OEParseSmiles(mol1,isosmi) 94 isosmi=OECreateSmiString(mol1,OESMILESFlag_ISOMERIC) 95 mol1.Clear() 96 97 result['isosmi']=isosmi 98 99 cansmi=OECreateCanSmiString(mol)#create canonical smile 100 result['cansmi']=cansmi 101 102 absmi=OECreateAbsSmiString(mol)#create isomeric smile 103 result['absmi']=absmi#insert isomeric smile 104 105 self.CanMol(mol=mol,kek=False,aromodel=OEAroModelOpenEye,verbose=0) 106 107 if kwarg['natoms']: 108 result['natoms']=mol.NumAtoms() 109 110 if kwarg['nbonds']: 111 result['nbonds']=mol.NumBonds() 112 113 if kwarg['coords'] and kwarg['bonds']: 114 coords=[] 115 for atom in mol.GetAtoms():#iterate atoms 116 coord=[] 117 118 num=atom.GetIdx()+1#atom number in SDF 119 coord.append(num) 120 121 xyz=mol.GetCoords(atom)#atom coordinate 122 xyz=list(xyz)#change tuple to list 123 coord.extend(xyz) 124 125 sym=atom.GetAtomicNum()#atom symbol 126 coord.append(sym) 127 128 q=atom.GetFormalCharge()#atom charge 129 coord.append(q) 130 coords.append(coord) 131 result['coords']=coords 132 133 if kwarg['bonds'] and kwarg['coords']: 134 bonds=[] 135 for bond in mol.GetBonds():#iterate bonds 136 bnd=[] 137 begatom=bond.GetBgnIdx()+1#begin bond atom 138 endatom=bond.GetEndIdx()+1#end bond atom 139 if begatom>endatom: 140 begatom,endatom=endatom,begatom 141 bnd.append(begatom) 142 bnd.append(endatom) 143 bType=bond.GetIntType()#integer bond type 144 bnd.append(bType) 145 bonds.append(bnd) 146 result['bonds']=bonds 147 148 if not (kwarg['bonds'] and kwarg['coords']): 149 result['conntab'],result['symq']=self.Smi2ConnTab(smile=result['isosmi'],aromaticflag=OEAroModelOpenEye) 150 151 if kwarg['tag']: 152 result['property']=property 153 if isinstance(kwarg['tag'],(bool,int)): 154 for tag in OEGetSDDataPairs(mol): 155 property[tag.GetTag()]=tag.GetValue() 156 157 elif isinstance(kwarg['tag'],str): 158 if OEHasSDData(mol,kwarg['tag']): 159 property[kwarg['tag']]=OEGetSDData(mol,kwarg['tag']) 160 elif isinstance(kwarg['tag'],list): 161 for tagitem in kwarg['tag']: 162 if OEHasSDData(mol,tagitem): 163 property[tagitem]=OEGetSDData(mol,tagitem) 164 result['property']=property 165 166 mol.Clear()#order is important, clear molecule 167 yield result
168
169 - def Smi2ConnTab(self,smile,aromaticflag=OEAroModelOpenEye):
170 """ 171 Create canonical connection table from SMILE 172 INPUT: 173 class object 174 smile - str - SMILE code 175 OUTPUT: 176 connection table - list 177 """ 178 molecule=OEGraphMol() 179 OEParseSmiles(molecule,smile) 180 OEAssignAromaticFlags(molecule,aromaticflag) 181 OEFindRingAtomsAndBonds(molecule) 182 OEAddExplicitHydrogens(molecule) 183 self.CanMol(mol=molecule,kek=False,aromodel=aromaticflag,verbose=0) 184 connTab=[] 185 186 for bond in molecule.GetBonds():#iterate bonds 187 bnd=[] 188 begatom=bond.GetBgnIdx()+1#begin bond atom 189 endatom=bond.GetEndIdx()+1#end bond atom 190 if begatom>endatom: 191 begatom,endatom=endatom,begatom 192 bnd.append(begatom) 193 bnd.append(endatom) 194 bType=bond.GetOrder()#integer bond type 195 bnd.append(bType) 196 connTab.append(bnd) 197 198 atSymQ=[] 199 for atom in molecule.GetAtoms(): 200 asq=[] 201 num=atom.GetIdx()+1 202 asq.append(num) 203 sym=atom.GetAtomicNum() 204 asq.append(sym) 205 q=atom.GetFormalCharge()#atom charge 206 asq.append(q) 207 atSymQ.append(asq) 208 return connTab,atSymQ
209
210 - def CanSmi(self,mol,iso,kek,verbose):
211 """ 212 Create canonical smile (unique or absolute) 213 INPUT: 214 mol - molecule class object 215 iso - boolean, create isomeric SMILE 216 kek - boolean, Kekule aromatic form 217 varbose - boolean, show warnings 218 OUTPUT: 219 """ 220 smiflag=OESMILESFlag_Canonical 221 if iso: 222 smiflag|=OESMILESFlag_ISOMERIC 223 OEPerceiveChiral(mol) 224 for atom in mol.GetAtoms(): 225 IsChiral=atom.IsChiral() 226 HasStereo=atom.HasStereoSpecified() 227 228 if HasStereo and not IsChiral: 229 if verbose: 230 OEThrow.Warning("correcting atom stereo inconsistency!") 231 nbrs=[] 232 for bond in atom.GetBonds(): 233 nbrs.append(bond.GetNbr(atom)) 234 atom.SetStereo(nbrs,OEAtomStereo_Tetrahedral,OEBondStereo_Undefined) 235 236 for bond in mol.GetBonds(): 237 if bond.GetOrder()!=2: continue 238 HasStereoSpecified=bond.HasStereoSpecified() 239 IsChiral=bond.IsChiral() 240 if HasStereoSpecified and not IsChiral: 241 if verbose: 242 OEThrow.Warning("correcting bond stereo inconsistency!") 243 a1=bond.GetBgn() 244 a2=bond.GetEnd() 245 bond.SetStereo([a1,a2],OEBondStereo_CisTrans,OEBondStereo_Undefined) 246 247 if kek: 248 OEFindRingAtomsAndBonds(mol) 249 OEAssignAromaticFlags(mol,OEAroModelOpenEye) 250 for bond in mol.GetBonds(OEIsAromaticBond()): 251 bond.SetIntType(5) 252 OECanonicalOrderAtoms(mol) 253 OECanonicalOrderBonds(mol) 254 OEClearAromaticFlags(mol) 255 OEKekulize(mol) 256 257 smi=OECreateSmiString(mol,smiflag) 258 return smi
259
260 - def CanMol(self,mol,kek=False,aromodel=OEAroModelOpenEye,verbose=0):
261 """ 262 Create canonical connection table 263 INPUT: 264 mol - molecule class object 265 kek - boolean, Kekule aromatic form 266 aromodel - aromatic OpenEye model, default OEAroModelOpenEye 267 varbose - boolean, show warnings 268 OUPTUT: 269 """ 270 for atom in mol.GetAtoms(): 271 if atom.HasStereoSpecified(OEAtomStereo_Tetra): 272 nbrs=OEAtomVector() 273 for bond in atom.GetBonds(): 274 nbrs.append(bond.GetNbr(atom)) 275 if len(nbrs)<3: OEThrow.Info('ERROR: stereo nbrs <3') 276 atom.SetStereo(nbrs,OEAtomStereo_Tetra,OEAtomStereo_Undefined) 277 278 for bond in mol.GetBonds(): 279 if bond.HasStereoSpecified(OEBondStereo_CisTrans): 280 nbrs=OEAtomVector() 281 nbrs.append(bond.GetBgn()) 282 nbrs.append(bond.GetEnd()) 283 bond.SetStereo(nbrs,OEBondStereo_CisTrans,OEBondStereo_Undefined) 284 OEMDLClearParity(mol) 285 OEMDLPerceiveParity(mol) ## Set MDL atom parity from OEChem stereo 286 paritytag=OEGetTag("MDLParity") 287 for atom in mol.GetAtoms(): 288 atom.SetIntData(paritytag,0) ## do not set the parity=3 289 OEMDLPerceiveBondStereo(mol) ## Set wedge/hash from OEChem stereo 290 OECanonicalOrderAtoms(mol) 291 OECanonicalOrderBonds(mol) 292 mol.Sweep() 293 294 if kek: 295 OEFindRingAtomsAndBonds(mol) 296 OEAssignAromaticFlags(mol,aromodel) 297 for bond in mol.GetBonds(OEIsAromaticBond()): 298 bond.SetIntType(5) 299 OEClearAromaticFlags(mol) 300 OEKekulize(mol) 301 return
302
303 - def parseSDFOE(self,**kwarg):
304 """ 305 parse ZINC SDF file using OEChem 306 INPUT: 307 oechem - boolean, get oechem number, default True 308 natoms - boolean, get number of atoms, default True 309 nbonds - boolean, get number of bonds, default True 310 chiral- boolean, get chiral flag, default True 311 coords - boolean, get coords list [ x y z symbol q ], default True 312 tag - boolean, get tags if exists, default True 313 overTag - boolean, flag to add or update tags in Property attribute 314 constattr - add constant attribute, default None 315 OUTPUT: 316 result dictionary 317 """ 318 keys=("natoms","nbonds","coords","tag","overTag","constattr","bonds") 319 filter(lambda key: filterKeys(kwarg,keys,key),kwarg.keys())#filter keys 320 if not kwarg.has_key('tag'): 321 kwarg.setdefault('tag',True) 322 323 if not kwarg.has_key('coords'): 324 kwarg.setdefault('coords',True) 325 326 if not kwarg.has_key('bonds'): 327 kwarg.setdefault('bonds',True) 328 329 if not kwarg.has_key('nbonds'): 330 kwarg.setdefault('nbonds',True) 331 332 if not kwarg.has_key('natoms'): 333 kwarg.setdefault('natoms',True) 334 335 if not kwarg.has_key('overTag'): 336 kwarg.setdefault('overTag',False) 337 338 if not kwarg.has_key('constattr'): 339 kwarg.setdefault('constattr',{}) 340 341 if not isinstance(kwarg['constattr'],dict): 342 print 'Error: Specify attribute dictionary!' 343 sys.exit(1) 344 345 ifs=oemolistream(self.kwargs['dbfile']) 346 sw=OEStopwatch()#time counter 347 sw.Start() 348 dots=OEDots(10000,500,'>> SDF molecule')#dots progress indicator 349 self.openDict() 350 if not self.openDict():#can not open shelve dictionary 351 sys.exit(1) 352 print 'Parsing %s, please wait ...' %self.kwargs['dbfile'] 353 self.log.info('Parsing SDF file: %s',self.kwargs['dbfile']) 354 nmol=0#number of read molecules 355 nmolins=0#number of inserted molecules into dictionary 356 notmolins=0#not inserted into dictionary 357 for moldict in self.parseMolOE(ifs.GetOEGraphMols(),**kwarg): 358 nmol+=1 359 dots.Update()#dots progress indicator update 360 if (kwarg['tag'] and kwarg['overTag']):#add or update Property attribute 361 if len(moldict['property']):#check length of property dictionary 362 tagval=moldict['isosmi']# 363 dictobj=self.getDict(tagval)#get dictionary object 364 if dictobj:#if exists 365 if dictobj.kwargs.has_key('property'):#if has key 366 dictobj.kwargs['property'].update(moldict['property']) 367 else: 368 dictobj.kwargs['property']=moldict['property'] 369 try: 370 self.insertDict(tagval,dictobj)#insert updated object into dictionary 371 nmolins+=1 372 except anydbm.error, error: 373 print 'Error: %s' %error 374 self.log.exception('Error: %s',error) 375 self.closeDict() 376 sys.exit(1) 377 else: 378 notmolins+=1 379 continue 380 else: 381 notmolins+=1 382 continue 383 else: 384 inobj=InputDB(**moldict) 385 if not (self.getDict(inobj.kwargs['isosmi'])):# insert obj into dictionary if absent 386 if kwarg['constattr']:#if has constattr 387 for constkey,constval in kwarg['constattr'].iteritems():#iterate const dictionary 388 if inobj.kwargs.has_key(constkey):#if key exists 389 continue 390 else: 391 inobj.kwargs[constkey]=constval#add attribute to object 392 393 try: 394 self.insertDict(inobj.kwargs['isosmi'],inobj) 395 nmolins+=1 396 except anydbm.error, error: 397 print 'Error: %s' %error 398 self.log.exception('Error: %s',error) 399 self.closeDict() 400 sys.exit(1) 401 else: 402 if kwarg['constattr']:#if has constattr 403 dictobj=self.getDict(inobj.kwargs['isosmi'])#get object from shelve dictionary 404 for constkey,constval in kwarg['constattr'].iteritems(): 405 if dictobj.kwargs.has_key(constkey):#if object has key 406 if isinstance(dictobj.kwargs[constkey], type(constval)):#check type conistence 407 408 if isinstance(constval,list):#if constant value is list 409 for constitem in constval:#iterate const attribute value 410 if constitem not in dictobj.kwargs[constkey]:#if value not in object attribure value 411 dictobj.kwargs[constkey].append(constitem)#list append 412 413 elif isinstance(constval,dict):#if constant value is dictionary 414 dictobj.kwargs[constkey].update(constval) 415 416 417 elif isinstance(constval,(int,float)):#if constant value is integer or float 418 if dictobj.kwargs[constkey]!=constval:#if object value differs from const value 419 dictobj.kwargs[constkey]=constval#exchange value 420 421 else:#type not defined 422 print 'Error: Unknown type!' 423 continue 424 425 try: 426 self.insertDict(inobj.kwargs['isosmi'],dictobj)#insert updated object into dictionary 427 except anydbm.error, error: 428 print 'Error: %s' %error 429 self.log.exception('Error: %s',error) 430 self.closeDict() 431 sys.exit(1) 432 433 else:#types not consistent 434 continue 435 else:#object has not attribute specified in constattr key 436 continue 437 notmolins+=1 438 continue#if object in dictionary 439 self.closeDict() 440 441 ifs.close() 442 dots.Total() 443 print "SDF time reading: %.2f s" %(sw.Elapsed()) 444 print "%s: %s molecules inserted, %s refused!" %(self.kwargs['dictfilename'],nmolins,notmolins) 445 self.log.info('Read %s molecules in %.2f s, %s: inserted %s, refused %s ',nmol,sw.Elapsed(),self.kwargs['dictfilename'],nmolins,notmolins) 446 self.logobj.rmLogHandler()
447
448 - def genCanSmi(self):
449 """ 450 generate canonical SMILE 451 INPUT: 452 class object 453 OUTPUT: 454 write canonical smile, filename 455 """ 456 ifs=oemolistream() 457 ifs.SetFormat(OEFormat_SDF) 458 ofs=oemolostream() 459 ofs.SetFormat(OEFormat_CAN) 460 if ifs.open(os.path.abspath(self.kwargs['dbfile'])): 461 for mol in ifs.GetOEMols(): 462 OEWriteMolecule(ofs,mol) 463 ifs.close() 464 else: 465 sys.stderr.write("Unable to open input file")
466
467 - def genIsoSmi(self):
468 """ 469 generate isomeric SMILE 470 INPUT: 471 class object 472 OUTPUT: 473 write isomeric smile, filename 474 """ 475 ifs=oemolistream() 476 ifs.SetFormat(OEFormat_SDF) 477 ofs=oemolostream() 478 ofs.SetFormat(OEFormat_ISM) 479 if ifs.open(os.path.abspath(self.kwargs['dbfile'])): 480 for mol in ifs.GetOEMols(): 481 OEWriteMolecule(ofs,mol) 482 ifs.close() 483 else: 484 sys.stderr.write("Unable to open input file")
485
486 - def genAbSmi(self):
487 """ 488 generate absolute (isomeric+unique) SMILE 489 INPUT: 490 class object 491 OUTPUT: 492 write absolute smile, filename 493 """ 494 ifs=oemolistream() 495 ifs.SetFormat(OEFormat_SDF) 496 ofs=oemolostream() 497 ofs.SetFormat(OEFormat_SMI) 498 if ifs.open(os.path.abspath(self.kwargs['dbfile'])): 499 for mol in ifs.GetOEMols(): 500 OEWriteMolecule(ofs,mol) 501 ifs.close() 502 else: 503 sys.stderr.write("Unable to open input file")
504 505 ############### End of class ########################################## 506 ############ MAIN ##################################################### 507 ############ example of usage ######################################### 508 if __name__=='__main__': 509 pass 510 # A=SDFile('/tmp/13normalpH.sdf',dictpath='/tmp/InpuTest1',dictfilename='ZINCStdInp0',path='/tmp/Log',filename='dbfile') 511 # A=SDFile('/tmp/ZINC/ZINC.sdf',dictpath='/tmp/InputZINC',path='/tmp/Log',filename='dbfile') 512 # A=SDFile('/tmp/ZINC/PubChem.sdf',dictpath='/tmp/InputZINC',path='/tmp/Log',filename='dbfile') 513 # ### SDF Parser based on OpenEye OEChem ### 514 # A.parseSDFOE() 515 # A.showDict() 516 # A.parseSDFOE(tag=False,coords=True,bonds=True) 517 # A.parseSDFOE(coords=False,bonds=False,tag=False) 518 # A.parseSDFOE(tag=True,coords=True,constattr={'pH':[1,[2.0,4.5]]}) 519 # A.parseSDFOE(tag=True,coords=True,bonds=False) 520 # 521 # # ADD only specified tag, tag=True ALL, tag=str, tag=[list(str)] 522 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag=True) 523 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag='PUBCHEM_OPENEYE_CAN_SMILES') 524 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag=['PUBCHEM_OPENEYE_CAN_SMILES','PUBCHEM_EXACT_MASS','something']) 525 # 526 # # IMPORTANT FLAG overTag OVERWRITE specified flag if object exists in dictionary 527 # A=SDFile('/tmp/ZINC/PubChem.sdf',dictpath='/home/abak/InputDB',path='/tmp/Log',filename='dbfile') 528 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag=True,overTag=True) 529 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag='PUBCHEM_COMPOUND_CID',overTag=True) 530 # A.parseSDFOE(coords=False,nbonds=False,natoms=False,tag=['PUBCHEM_COMPOUND_CID','PUBCHEM_COMPOUND_CANONICALIZED'],overTag=True) 531 # 532 # ## Shelve Dictionary operations ### 533 # A.insAttrDict(constattr={'pHlow':3.0,'pHhigh':9.75,'pHmid':False}) 534 # A.delAttrDict(['pHlow','pHhigh','pHmid']) 535 # A.showDict() 536 # A.openDict() 537 # print A.getLength() 538 # print A.getDict('CC(C)C[C@@H]1C(=O)N(C(=S)N1)CC=C') 539 # A.closeDict() 540 # A.showDictKey('CC(C)C[C@@H]1C(=O)N(C(=S)N1)CC=C') 541 # A.genCanSmi() 542 # A.genIsoSmi() 543 # A.genAbSmi() 544