#! /usr/bin/env python """ Let's write our own python parser to clean up the pdf (after pdftotext of course). Instructions: run pdftotext like this: $ pdftotext -f 9 -l 81 -raw -nopgbrk 04_06PU.PDF 04_06PU-3.txt then run the python parser like this: $ python ParseDict.py 04_06PU.txt dicomV3.dic """ import re,os """ PdfTextParser takes as input a text file (produced by pdftotext) and create as output a clean file (ready to be processed) by DicomV3Expander Warning: PdfTextParser does not expand: - (xxxx,xxxx to xxxx) xxxxxxxxxxxx or - (12xx, 3456) comment... """ class PdfTextParser: # Cstor def __init__(self): self._InputFilename = '' self._OutputFilename = '' self._Infile = 0 self._OutLines = [] self._PreviousBuffers = [] def SetInputFileName(self,s): self._InputFilename = s def SetOutputFileName(self,s): self._OutputFilename = s # Function returning if s is a comment for sure def IsAComment(self,s): #print s, len(s) if s == "Tag Name VR VM": return True elif s == "PS 3.6-2003": return True elif s == "PS 3.6-2004": return True patt = re.compile('^Page [0-9]+$') if( patt.match(s) ): return True return False def IsAStartingLine(self,s): patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*)$') if( patt.match(s) ): return True return False def IsAFullLine(self,s): patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*) [A-Z][A-Z] [0-9]$') if( patt.match(s) ): return True return False # FIXME this function could be avoided... def IsSuspicious(self,s): l = len(s) if l > 80: return True return False def AddOutputLine(self,s): assert not self.IsAComment(s) self._OutLines.append(s + '\n') def Open(self): self._Infile = file(self._InputFilename, 'r') for line in self._Infile.readlines(): line = line[:-1] # remove '\n' if not self.IsAComment( line ): if self.IsAStartingLine(line): #print "Previous buffer:",self._PreviousBuffers previousbuffer = ' '.join(self._PreviousBuffers) if self.IsAStartingLine(previousbuffer): if not self.IsSuspicious(previousbuffer): self.AddOutputLine(previousbuffer) else: # this case should not happen if I were to rewrite the # thing I should be able to clean that #print "Suspicious:", previousbuffer #print "List is:", self._PreviousBuffers s = self._PreviousBuffers[0] if self.IsAFullLine(s): # That means we have a weird line that does not start # as usual (xxxx,xxxx) therefore we tried constructing # a buffer using a the complete previous line... #print "Full line:", s self.AddOutputLine(s) s2 = ' '.join(self._PreviousBuffers[1:]) #print "Other Full line:", s2 self.AddOutputLine(s2) else: # we have a suspicioulsy long line, so what that could # happen, let's check: if self.IsAFullLine(previousbuffer): self.AddOutputLine(previousbuffer) else: # This is the only case where we do not add # previousbuffer to the _OutLines print "Suspicious and Not a full line:", s else: if previousbuffer: print "Not a buffer:", previousbuffer # We can clean buffer, since only the case 'suspicious' + # 'Not a full line' has not added buffer to the list self._PreviousBuffers = [] # In all cases save the line for potentially growing this line assert not self.IsAComment(line) self._PreviousBuffers.append(line) else: #print "Not a line",line assert not self.IsAComment(line) self._PreviousBuffers.append(line) else: #print "Comment:",line previousbuffer = ' '.join(self._PreviousBuffers) if previousbuffer and self.IsAStartingLine(previousbuffer): #print "This line is added:", previousbuffer self.AddOutputLine( previousbuffer ) else: #print "Line is comment:", line print "Buffer is:", previousbuffer # Ok this is a comment we can safely clean the buffer: self._PreviousBuffers = [] self.Write() def Write(self): outfile = file(self._OutputFilename, 'w') outfile.writelines( self._OutLines ) outfile.close() self._Infile.close() # Main function to call for parsing def Parse(self): self.Open() """ subclass """ class UIDParser(PdfTextParser): def IsAStartingLine(self,s): patt = re.compile('^1.2.840.10008.[0-9.]+ (.*)$') if( patt.match(s) ): return True #print "Is Not:", s return False def IsAFullLine(self,s): patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) PS ?[0-9].1?[0-9]$') if( patt.match(s) ): return True patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) Well-known frame of reference$') if( patt.match(s) ): return True patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) \\(Retired\\)$') if( patt.match(s) ): return True return False def IsAComment(self,s): if PdfTextParser.IsAComment(self,s): return True # else let's enhance the super class patt = re.compile('^SPM2 (.*) http(.*)$') if( patt.match(s) ): return True return False def AddOutputLine(self,s): if self.IsAFullLine(s): return PdfTextParser.AddOutputLine(self,s) print "Discarding:", s class TransferSyntaxParser(UIDParser): def IsAFullLine(self,s): patt = re.compile('^(.*) Transfer Syntax PS ?[0-9].1?[0-9]$') if patt.match(s): return UIDParser.IsAStartingLine(self,s) print "Not a TS:", s return False """ This class is meant to expand line like: - (xxxx,xxxx to xxxx) xxxxxxxxxxxx or - (12xx, 3456) comment... """ class DicomV3Expander: def __init__(self): self._InputFilename = '' self._OutputFilename = '' self._OutLines = [] def SetInputFileName(self,s): self._InputFilename = s def SetOutputFileName(self,s): self._OutputFilename = s # Function to turn into lower case a tag: # ex: (ABCD, EF01) -> (abcd, ef01) def LowerCaseTag(self,s): #print "Before:", s[:-1] patt = re.compile('^(\\([0-9a-fA-F]+,[0-9a-fA-F]+\\))(.*)$') m = patt.match(s) if m: s1 = m.group(1) s2 = m.group(2) return s1.lower() + s2 else: print "Impossible case:", s os.sys.exit(1) def AddOutputLine(self,s): if s.__class__ == list: for i in s: self._OutLines.append(i + '\n') else: self._OutLines.append(s + '\n') # Expand the line approriaetkly and also add it to the # _OutLines list def ExpandLine(self, s): assert s[-1] == '\n' s = s[:-1] # remove \n list = [] if self.NeedToExpansion(s, list): self.AddOutputLine(list) # list != [] elif self.NeedXXExpansion(s, list): self.AddOutputLine(list) # list != [] else: self.AddOutputLine(self.LowerCaseTag(s)) # If line is like: # (0020,3100 to 31FF) Source Image Ids RET def NeedToExpansion(self,s, list): patt = re.compile('^\\(([0-9a-fA-F]+),([0-9a-fA-F]+) to ([0-9a-fA-F]+)\\)(.*)$') m = patt.match(s) if m: #print m.groups() gr = m.group(1) el_start = '0x'+m.group(2) el_end = '0x'+m.group(3) for i in range(eval(el_start), eval(el_end)): el = hex(i)[2:] l = '('+gr+','+el+')'+m.group(4) list.append(l) return True return False # If line is like: # (50xx,1200) Number of Patient Related Studies IS 1 def NeedXXExpansion(self,s,list): patt = re.compile('^\\(([0-9a-fA-F]+)xx,([0-9a-fA-F]+)\\)(.*)$') m = patt.match(s) if m: #print m.groups() gr_start = m.group(1) el = m.group(2) #el_start = '0x'+m.group(2) #el_end = '0x'+m.group(3) start = '0x'+gr_start+'00' end = '0x'+gr_start+'FF' for i in range(eval(start), eval(end)): gr = hex(i)[2:] l = '('+gr+','+el+')'+m.group(3) #print l list.append(l) return True return False def Write(self): outfile = file(self._OutputFilename, 'w') outfile.writelines( self._OutLines ) outfile.close() def Expand(self): infile = file(self._InputFilename,'r') for line in infile.readlines(): # ExpandLine also LowerCase the line self.ExpandLine(line) # l is [1,n] lines self.Write() infile.close() if __name__ == "__main__": argc = len(os.sys.argv ) if ( argc < 3 ): print "Sorry, wrong list of args" os.sys.exit(1) #error inputfilename = os.sys.argv[1] outputfilename = os.sys.argv[2] tempfile = "/tmp/mytemp" """ dp = PdfTextParser() dp.SetInputFileName( inputfilename ) #dp.SetOutputFileName( outputfilename ) dp.SetOutputFileName( tempfile ) dp.Parse() exp = DicomV3Expander() exp.SetInputFileName( tempfile ) exp.SetOutputFileName( outputfilename ) exp.Expand() """ dp = TransferSyntaxParser() dp.SetInputFileName( inputfilename ) dp.SetOutputFileName( outputfilename ) dp.Parse() #print dp.IsAStartingLine( "(0004,1212) File-set Consistency Flag US 1\n" )