Dicts/ParseDict.py

   1 #! /usr/bin/env python
   2 """
   3 Let's write our own python parser to clean up the pdf (after
   4 pdftotext of course).
   5 Instructions: run pdftotext like this:
   6
   7 $ pdftotext -f 9 -l 81 -raw -nopgbrk 04_06PU.PDF 04_06PU-3.txt
   8
   9 then run the python parser like this:
  10
  11 $ python ParseDict.py 04_06PU.txt dicomV3.dic
  12 """
  13 import re,os
  14
  15 """
  16 PdfTextParser takes as input a text file (produced by pdftotext)
  17 and creates as output a clean file, ready to be processed by
  18 DicomV3Expander.
  19 Warning: PdfTextParser does not expand:
  20 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
  21 or
  22 - (12xx, 3456) comment...
  23
  24 """
  25 class PdfTextParser:
  26   # Cstor
  27   def __init__(self):
  28     self._InputFilename = ''
  29     self._OutputFilename = ''
  30     self._Infile = 0
  31     self._OutLines = []
  32     self._PreviousBuffers = []
  33
  34   def SetInputFileName(self,s):
  35     self._InputFilename = s
  36
  37   def SetOutputFileName(self,s):
  38     self._OutputFilename = s
  39
  40   # Function returning if s is a comment for sure
  41   def IsAComment(self,s):
  42     #print s,  len(s)
  43     if s == "Tag Name VR VM":
  44       return True
  45     elif s == "PS 3.6-2003":
  46       return True
  47     elif s == "PS 3.6-2004":
  48       return True
  49     patt = re.compile('^Page [0-9]+$')
  50     if( patt.match(s) ):
  51       return True
  52     return False
  53
  54   def IsAStartingLine(self,s):
  55     patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*)$')
  56     if( patt.match(s) ):
  57       return True
  58     return False
  59
  60   def IsAFullLine(self,s):
  61     patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*) [A-Z][A-Z] [0-9]$')
  62     if( patt.match(s) ):
  63       return True
  64     return False
  65
  66   # FIXME this function could be avoided...
  67   def IsSuspicious(self,s):
  68     l = len(s)
  69     if l > 80:
  70       return True
  71     return False
  72
  73   def AddOutputLine(self,s):
  74     assert not self.IsAComment(s)
  75     self._OutLines.append(s + '\n')
  76
  77   def Open(self):
  78     self._Infile = file(self._InputFilename, 'r')
  79     for line in self._Infile.readlines():
  80       line = line[:-1] # remove '\n'
  81       if not self.IsAComment( line ):
  82         if self.IsAStartingLine(line):
  83           #print "Previous buffer:",self._PreviousBuffers
  84           previousbuffer = ' '.join(self._PreviousBuffers)
  85           if self.IsAStartingLine(previousbuffer):
  86             if not self.IsSuspicious(previousbuffer):
  87               self.AddOutputLine(previousbuffer)
  88             else:
  89               # this case should not happen if I were to rewrite the
  90               # thing I should be able to clean that
  91               #print "Suspicious:", previousbuffer
  92               #print "List is:", self._PreviousBuffers
  93               s = self._PreviousBuffers[0]
  94               if self.IsAFullLine(s):
  95                 # That means we have a weird line that does not start
  96                 # as usual (xxxx,xxxx) therefore we tried constructing
  97                 # a buffer using a the complete previous line...
  98                 #print "Full line:", s
  99                 self.AddOutputLine(s)
 100                 s2 = ' '.join(self._PreviousBuffers[1:])
 101                 #print "Other Full line:", s2
 102                 self.AddOutputLine(s2)
 103               else:
 104                 # we have a suspicioulsy long line, so what that could
 105                 # happen, let's check:
 106                 if self.IsAFullLine(previousbuffer):
 107                   self.AddOutputLine(previousbuffer)
 108                 else:
 109                   # This is the only case where we do not add
 110                   # previousbuffer to the _OutLines
 111                   print "Suspicious and Not a full line:", s
 112           else:
 113             if previousbuffer:
 114               print "Not a buffer:", previousbuffer
 115           # We can clean buffer, since only the case 'suspicious' +
 116           # 'Not a full line' has not added buffer to the list
 117           self._PreviousBuffers = []
 118           # In all cases save the line for potentially growing this line
 119           assert not self.IsAComment(line)
 120           self._PreviousBuffers.append(line)
 121         else:
 122           #print "Not a line",line
 123           assert not self.IsAComment(line)
 124           self._PreviousBuffers.append(line)
 125       else:
 126         #print "Comment:",line
 127         previousbuffer = ' '.join(self._PreviousBuffers)
 128         if previousbuffer and self.IsAStartingLine(previousbuffer):
 129           #print "This line is added:", previousbuffer
 130           self.AddOutputLine( previousbuffer )
 131         else:
 132           #print "Line is comment:", line
 133           print "Buffer is:", previousbuffer
 134         # Ok this is a comment we can safely clean the buffer:
 135         self._PreviousBuffers = []
 136     self.Write()
 137
 138   def Write(self):
 139     outfile = file(self._OutputFilename, 'w')
 140     outfile.writelines( self._OutLines )
 141     outfile.close()
 142     self._Infile.close()
 143
 144   # Main function to call for parsing
 145   def Parse(self):
 146     self.Open()
 147
 148 """
 149 subclass
 150 """
 151 class UIDParser(PdfTextParser):
 152   def IsAStartingLine(self,s):
 153     patt = re.compile('^1.2.840.10008.[0-9.]+ (.*)$')
 154     if( patt.match(s) ):
 155       return True
 156     #print "Is Not:", s
 157     return False
 158
 159   def IsAFullLine(self,s):
 160     patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) PS ?[0-9].1?[0-9]$')
 161     if( patt.match(s) ):
 162       return True
 163     patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) Well-known frame of reference$')
 164     if( patt.match(s) ):
 165       return True
 166     patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) \\(Retired\\)$')
 167     if( patt.match(s) ):
 168       return True
 169     return False
 170
 171   def IsAComment(self,s):
 172     if PdfTextParser.IsAComment(self,s):
 173       return True
 174     # else let's enhance the super class
 175     patt = re.compile('^SPM2 (.*) http(.*)$')
 176     if( patt.match(s) ):
 177       return True
 178     return False
 179
 180   def AddOutputLine(self,s):
 181     if self.IsAFullLine(s):
 182       return PdfTextParser.AddOutputLine(self,s)
 183     print "Discarding:", s
 184
 185
 186 """
 187 TransferSyntaxParser
 188 """
 189 class TransferSyntaxParser(UIDParser):
 190   def IsAFullLine(self,s):
 191     patt = re.compile('^(.*) Transfer Syntax PS ?[0-9].1?[0-9]$')
 192     if patt.match(s):
 193       return UIDParser.IsAStartingLine(self,s)
 194     print "Not a TS:", s
 195     return False
 196
 197 """
 198 Papyrus parser
 199 pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt
 200
 201 I need to do a second pass for pages:
 202 #29 since I need to find [0-9.]+
 203 #40,41 since they start with numbers in two columns !!
 204 """
 205 class PapyrusParser(PdfTextParser):
 206   def __init__(self):
 207     self._PreviousPage = 0
 208     self._PreviousNumber = 0
 209     PdfTextParser.__init__(self)
 210
 211   def IsAStartingLine(self,s):
 212     patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
 213     if( patt.match(s) ):
 214       return True
 215     return False
 216
 217   def IsAFullLine(self,s):
 218     patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
 219     if( patt.match(s) ):
 220       return True
 221     return False
 222
 223   def IsAComment(self,s):
 224     # dummy case:
 225     if s == 'Attribute Name Tag Type Attribute Description':
 226       print "Dummy", s
 227       return True
 228     # Indicate page #, spaces ending with only one number
 229     # Sometime there is a line with only one number, we need to
 230     # make sure that page # is strictly increasing
 231     patt = re.compile('^[1-9][0-9]+$')
 232     if( patt.match(s) ):
 233       if( eval(s) > self._PreviousPage):
 234         print "Page #", eval(s)
 235         self._PreviousNumber = 0
 236         self._PreviousPage = eval(s)
 237         return True
 238     # Now within each page there is a comment that start with a #
 239     # let's do the page approach wich reset at each page
 240     patt = re.compile('^[0-9]+$')
 241     if( patt.match(s) ):
 242       print "Number #", eval(s)
 243       self._PreviousNumber = eval(s)
 244       return True
 245     return False
 246
 247   def AddOutputLine(self,s):
 248     assert not self.IsAComment(s)
 249     s = s.replace('\n','')
 250     #print "REMOVE return:", s
 251     patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) ([0-9C]+) (.*)$')
 252     m = patt.match(s)
 253     ss = 'dummy (0000,0000) 0'
 254     if m:
 255       ss = m.group(2) + ' ' + m.group(3) + ' ' + m.group(1)
 256     else:
 257       patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) (.*)$')
 258       m = patt.match(s)
 259       if m:
 260         ss = m.group(2) + ' 0 ' + m.group(1)
 261     self._OutLines.append(ss + '\n')
 262
 263   def Open(self):
 264     self._Infile = file(self._InputFilename, 'r')
 265     for line in self._Infile.readlines():
 266       line = line[:-1] # remove '\n'
 267       if not self.IsAComment( line ):
 268         if self.IsAStartingLine(line):
 269           #print "Previous buffer:",self._PreviousBuffers
 270           previousbuffer = ' '.join(self._PreviousBuffers)
 271           if self.IsAFullLine(previousbuffer):
 272             self.AddOutputLine(previousbuffer)
 273           else:
 274             if previousbuffer:
 275               print "Not a buffer:", previousbuffer
 276           # We can clean buffer, since only the case 'suspicious' +
 277           # 'Not a full line' has not added buffer to the list
 278           self._PreviousBuffers = []
 279           # In all cases save the line for potentially growing this line
 280           # just to be safe remove any white space at begining of string
 281           assert not self.IsAComment(line)
 282           self._PreviousBuffers.append(line.strip())
 283         else:
 284           #print "Not a line",line
 285           assert not self.IsAComment(line)
 286           # just to be safe remove any white space at begining of string
 287           self._PreviousBuffers.append(line.strip())
 288       else:
 289         #print "Previous buffer:",self._PreviousBuffers
 290         previousbuffer = ' '.join(self._PreviousBuffers)
 291         if previousbuffer and self.IsAStartingLine(previousbuffer):
 292           #print "This line is added:", previousbuffer
 293           self.AddOutputLine( previousbuffer )
 294 #        else:
 295 #          #print "Line is comment:", line
 296 #          print "Buffer is:", previousbuffer
 297         # Ok this is a comment we can safely clean the buffer:
 298         self._PreviousBuffers = []
 299     self.Write()
 300
 301 """
 302 This class is meant to expand line like:
 303 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
 304 or
 305 - (12xx, 3456) comment...
 306
 307 """
 308 class DicomV3Expander:
 309   def __init__(self):
 310     self._InputFilename = ''
 311     self._OutputFilename = ''
 312     self._OutLines = []
 313
 314   def SetInputFileName(self,s):
 315     self._InputFilename = s
 316
 317   def SetOutputFileName(self,s):
 318     self._OutputFilename = s
 319
 320   # Function to turn into lower case a tag:
 321   # ex: (ABCD, EF01) -> (abcd, ef01)
 322   def LowerCaseTag(self,s):
 323     #print "Before:", s[:-1]
 324     patt = re.compile('^(\\([0-9a-fA-F]+,[0-9a-fA-F]+\\))(.*)$')
 325     m = patt.match(s)
 326     if m:
 327       s1 = m.group(1)
 328       s2 = m.group(2)
 329       return s1.lower() + s2
 330     else:
 331       print "Impossible case:", s
 332       os.sys.exit(1)
 333
 334   def AddOutputLine(self,s):
 335     if s.__class__ == list:
 336       for i in s:
 337         self._OutLines.append(i + '\n')
 338     else:
 339       self._OutLines.append(s + '\n')
 340
 341   # Expand the line approriaetkly and also add it to the
 342   # _OutLines list
 343   def ExpandLine(self, s):
 344     assert s[-1] == '\n'
 345     s = s[:-1]  # remove \n
 346     list = []
 347     if self.NeedToExpansion(s, list):
 348       self.AddOutputLine(list) # list != []
 349     elif self.NeedXXExpansion(s, list):
 350       self.AddOutputLine(list) # list != []
 351     else:
 352       self.AddOutputLine(self.LowerCaseTag(s))
 353
 354   # If line is like:
 355   # (0020,3100 to 31FF) Source Image Ids RET
 356   def NeedToExpansion(self,s, list):
 357     patt = re.compile('^\\(([0-9a-fA-F]+),([0-9a-fA-F]+) to ([0-9a-fA-F]+)\\)(.*)$')
 358     m = patt.match(s)
 359     if m:
 360       #print m.groups()
 361       gr = m.group(1)
 362       el_start = '0x'+m.group(2)
 363       el_end = '0x'+m.group(3)
 364       for i in range(eval(el_start), eval(el_end)):
 365         el = hex(i)[2:]
 366         l = '('+gr+','+el+')'+m.group(4)
 367         list.append(l)
 368       return True
 369     return False
 370
 371   # If line is like:
 372   # (50xx,1200) Number of Patient Related Studies IS 1
 373   def NeedXXExpansion(self,s,list):
 374     patt = re.compile('^\\(([0-9a-fA-F]+)xx,([0-9a-fA-F]+)\\)(.*)$')
 375     m = patt.match(s)
 376     if m:
 377       #print m.groups()
 378       gr_start = m.group(1)
 379       el = m.group(2)
 380       #el_start = '0x'+m.group(2)
 381       #el_end = '0x'+m.group(3)
 382       start = '0x'+gr_start+'00'
 383       end   = '0x'+gr_start+'FF'
 384       for i in range(eval(start), eval(end)):
 385         gr = hex(i)[2:]
 386         l = '('+gr+','+el+')'+m.group(3)
 387         #print l
 388         list.append(l)
 389       return True
 390     return False
 391
 392   def Write(self):
 393     outfile = file(self._OutputFilename, 'w')
 394     outfile.writelines( self._OutLines )
 395     outfile.close()
 396
 397   def Expand(self):
 398     infile = file(self._InputFilename,'r')
 399     for line in infile.readlines():
 400       # ExpandLine also LowerCase the line
 401       self.ExpandLine(line) # l is [1,n] lines
 402     self.Write()
 403     infile.close()
 404
 405
 406 if __name__ == "__main__":
 407   argc = len(os.sys.argv )
 408   if ( argc < 3 ):
 409     print "Sorry, wrong list of args"
 410     os.sys.exit(1) #error
 411
 412   inputfilename = os.sys.argv[1]
 413   outputfilename = os.sys.argv[2]
 414   tempfile = "/tmp/mytemp"
 415   """
 416   dp = PdfTextParser()
 417   dp.SetInputFileName( inputfilename )
 418   #dp.SetOutputFileName( outputfilename )
 419   dp.SetOutputFileName( tempfile )
 420   dp.Parse()
 421
 422   exp = DicomV3Expander()
 423   exp.SetInputFileName( tempfile )
 424   exp.SetOutputFileName( outputfilename )
 425   exp.Expand()
 426
 427   dp = TransferSyntaxParser()
 428   dp.SetInputFileName( inputfilename )
 429   dp.SetOutputFileName( outputfilename )
 430   dp.Parse()
 431   """
 432   dp = PapyrusParser()
 433   dp.SetInputFileName( inputfilename )
 434   dp.SetOutputFileName( outputfilename )
 435   dp.Parse()
 436
 437   #print dp.IsAStartingLine( "(0004,1212) File-set Consistency Flag US 1\n" )