3 Let's write our own python parser to clean up the pdf (after
5 Instructions: run pdftotext like this:
7 $ pdftotext -f 9 -l 81 -raw -nopgbrk 04_06PU.PDF 04_06PU-3.txt
9 then run the python parser like this:
11 $ python ParseDict.py 04_06PU.txt dicomV3.dic
16 PdfTextParser takes as input a text file (produced by pdftotext)
17 and creates as output a clean file (ready to be processed) by
19 Warning: PdfTextParser does not expand:
20 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
22 - (12xx, 3456) comment...
28 self._InputFilename = ''
29 self._OutputFilename = ''
32 self._PreviousBuffers = []
def SetInputFileName(self, s):
    """Remember the path of the text file to read.

    s is stored unchanged in self._InputFilename; nothing is opened here.
    """
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Remember the path of the file to write.

    s is stored unchanged in self._OutputFilename; nothing is opened here.
    """
    self._OutputFilename = s
40 # Function returning if s is a comment for sure
41 def IsAComment(self,s):
43 if s == "Tag Name VR VM":
45 elif s == "PS 3.6-2003":
47 elif s == "PS 3.6-2004":
49 patt = re.compile('^Page [0-9]+$')
54 def IsAStartingLine(self,s):
55 patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*)$')
60 def IsAFullLine(self,s):
61 patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*) [A-Z][A-Z] [0-9]$')
66 # FIXME this function could be avoided...
67 def IsSuspicious(self,s):
def AddOutputLine(self, s):
    """Queue one cleaned-up line for output.

    The line must not be recognised as a comment by IsAComment; this is
    checked with an assert, so the check disappears under ``python -O``.
    A newline is appended before the line is stored in self._OutLines.
    """
    assert not self.IsAComment(s)
    terminated = s + '\n'
    self._OutLines.append(terminated)
78 self._Infile = file(self._InputFilename, 'r')
79 for line in self._Infile.readlines():
80 line = line[:-1] # remove '\n'
81 if not self.IsAComment( line ):
82 if self.IsAStartingLine(line):
83 #print "Previous buffer:",self._PreviousBuffers
84 previousbuffer = ' '.join(self._PreviousBuffers)
85 if self.IsAStartingLine(previousbuffer):
86 if not self.IsSuspicious(previousbuffer):
87 self.AddOutputLine(previousbuffer)
89 # this case should not happen if I were to rewrite the
90 # thing I should be able to clean that
91 #print "Suspicious:", previousbuffer
92 #print "List is:", self._PreviousBuffers
93 s = self._PreviousBuffers[0]
94 if self.IsAFullLine(s):
95 # That means we have a weird line that does not start
96 # as usual (xxxx,xxxx) therefore we tried constructing
97 # a buffer using a the complete previous line...
98 #print "Full line:", s
100 s2 = ' '.join(self._PreviousBuffers[1:])
101 #print "Other Full line:", s2
102 self.AddOutputLine(s2)
104 # we have a suspiciously long line; that can legitimately
105 # happen, so let's check:
106 if self.IsAFullLine(previousbuffer):
107 self.AddOutputLine(previousbuffer)
109 # This is the only case where we do not add
110 # previousbuffer to the _OutLines
111 print "Suspicious and Not a full line:", s
114 print "Not a buffer:", previousbuffer
115 # We can clean buffer, since only the case 'suspicious' +
116 # 'Not a full line' has not added buffer to the list
117 self._PreviousBuffers = []
118 # In all cases save the line for potentially growing this line
119 assert not self.IsAComment(line)
120 self._PreviousBuffers.append(line)
122 #print "Not a line",line
123 assert not self.IsAComment(line)
124 self._PreviousBuffers.append(line)
126 #print "Comment:",line
127 previousbuffer = ' '.join(self._PreviousBuffers)
128 if previousbuffer and self.IsAStartingLine(previousbuffer):
129 #print "This line is added:", previousbuffer
130 self.AddOutputLine( previousbuffer )
132 #print "Line is comment:", line
133 print "Buffer is:", previousbuffer
134 # Ok this is a comment we can safely clean the buffer:
135 self._PreviousBuffers = []
139 outfile = file(self._OutputFilename, 'w')
140 outfile.writelines( self._OutLines )
144 # Main function to call for parsing
151 class UIDParser(PdfTextParser):
152 def IsAStartingLine(self,s):
153 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*)$')
159 def IsAFullLine(self,s):
160 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) PS ?[0-9].1?[0-9]$')
163 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) Well-known frame of reference$')
166 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) \\(Retired\\)$')
171 def IsAComment(self,s):
172 if PdfTextParser.IsAComment(self,s):
174 # else let's enhance the super class
175 patt = re.compile('^SPM2 (.*) http(.*)$')
180 def AddOutputLine(self,s):
181 if self.IsAFullLine(s):
182 return PdfTextParser.AddOutputLine(self,s)
183 print "Discarding:", s
189 class TransferSyntaxParser(UIDParser):
190 def IsAFullLine(self,s):
191 patt = re.compile('^(.*) Transfer Syntax PS ?[0-9].1?[0-9]$')
193 return UIDParser.IsAStartingLine(self,s)
199 pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt
201 I need to do a second pass for pages:
202 #29 since I need to find [0-9.]+
203 #40,41 since they start with numbers in two columns !!
205 class PapyrusParser(PdfTextParser):
207 self._PreviousPage = 0
208 self._PreviousNumber = 0
209 PdfTextParser.__init__(self)
211 def IsAStartingLine(self,s):
212 patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
215 # After page 39, lines are like:
216 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .*$')
222 def IsAFullLine(self,s):
223 patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
226 # After page 39, lines are like:
227 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9].*$')
233 def IsAComment(self,s):
235 if s == 'Attribute Name Tag Type Attribute Description':
238 patt = re.compile('^.*ANNEXE.*$')
241 # Indicate page #, spaces ending with only one number
242 # Sometime there is a line with only one number, we need to
243 # make sure that page # is strictly increasing
244 patt = re.compile('^[1-9][0-9]+$')
247 if( p > self._PreviousPage):
249 self._PreviousNumber = 0
250 self._PreviousPage = p
253 # print "PAGE ERROR:", s
254 # Now within each page there is a comment that start with a #
255 # let's do the page approach wich reset at each page
256 patt = re.compile('^[0-9]+$')
258 if( eval(s) > self._PreviousNumber):
259 #print "Number #", eval(s)
260 self._PreviousNumber = eval(s)
266 def AddOutputLine(self,s):
267 assert not self.IsAComment(s)
268 s = s.replace('\n','')
269 #print "REMOVE return:", s
270 patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) ([0-9C]+) (.*)$')
272 ss = 'dummy (0000,0000) 0'
274 ss = m.group(2) + ' ' + m.group(3) + ' ' + m.group(1)
276 patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) (.*)$')
279 ss = m.group(2) + ' 0 ' + m.group(1)
282 # There are two cases: one that ends with all capital letters
283 # explaining the 'DEFINED TERMS'
284 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9] [A-Z, ]$')
285 #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9]|1\\-n [A-Z, |3.0]+$')
286 #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [01n-] [A-Z, |3.0]+$')
290 self._OutLines.append(ss + '\n')
293 self._Infile = file(self._InputFilename, 'r')
294 for line in self._Infile.readlines():
295 line = line[:-1] # remove '\n'
296 if not self.IsAComment( line ):
297 if self.IsAStartingLine(line):
298 #print "Previous buffer:",self._PreviousBuffers
299 previousbuffer = ' '.join(self._PreviousBuffers)
300 if self.IsAFullLine(previousbuffer):
301 self.AddOutputLine(previousbuffer)
304 print "Not a buffer:", previousbuffer
305 # We can clean buffer, since only the case 'suspicious' +
306 # 'Not a full line' has not added buffer to the list
307 self._PreviousBuffers = []
308 # In all cases save the line for potentially growing this line
309 # just to be safe remove any white space at beginning of string
310 assert not self.IsAComment(line)
311 self._PreviousBuffers.append(line.strip())
313 #print "Not a line",line
314 assert not self.IsAComment(line)
315 # just to be safe remove any white space at begining of string
316 self._PreviousBuffers.append(line.strip())
318 #print "Previous buffer:",self._PreviousBuffers
319 previousbuffer = ' '.join(self._PreviousBuffers)
320 if previousbuffer and self.IsAStartingLine(previousbuffer):
321 #print "This line is added:", previousbuffer
322 self.AddOutputLine( previousbuffer )
324 # #print "Line is comment:", line
325 # print "Buffer is:", previousbuffer
326 # Ok this is a comment we can safely clean the buffer:
327 self._PreviousBuffers = []
331 This class is meant to expand lines like:
332 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
334 - (12xx, 3456) comment...
337 class DicomV3Expander:
339 self._InputFilename = ''
340 self._OutputFilename = ''
def SetInputFileName(self, s):
    """Record which file the expander should read.

    The value is kept as given in self._InputFilename.
    """
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Record which file the expander should write.

    The value is kept as given in self._OutputFilename.
    """
    self._OutputFilename = s
349 # Function to turn into lower case a tag:
350 # ex: (ABCD, EF01) -> (abcd, ef01)
351 def LowerCaseTag(self,s):
352 #print "Before:", s[:-1]
353 patt = re.compile('^(\\([0-9a-fA-F]+,[0-9a-fA-F]+\\))(.*)$')
358 return s1.lower() + s2
360 print "Impossible case:", s
363 def AddOutputLine(self,s):
364 if s.__class__ == list:
366 self._OutLines.append(i + '\n')
368 self._OutLines.append(s + '\n')
370 # Expand the line appropriately and also add it to the
372 def ExpandLine(self, s):
374 s = s[:-1] # remove \n
376 if self.NeedToExpansion(s, list):
377 self.AddOutputLine(list) # list != []
378 elif self.NeedXXExpansion(s, list):
379 self.AddOutputLine(list) # list != []
381 self.AddOutputLine(self.LowerCaseTag(s))
384 # (0020,3100 to 31FF) Source Image Ids RET
385 def NeedToExpansion(self,s, list):
386 patt = re.compile('^\\(([0-9a-fA-F]+),([0-9a-fA-F]+) to ([0-9a-fA-F]+)\\)(.*)$')
391 el_start = '0x'+m.group(2)
392 el_end = '0x'+m.group(3)
393 for i in range(eval(el_start), eval(el_end)):
395 l = '('+gr+','+el+')'+m.group(4)
401 # (50xx,1200) Number of Patient Related Studies IS 1
402 def NeedXXExpansion(self,s,list):
403 patt = re.compile('^\\(([0-9a-fA-F]+)xx,([0-9a-fA-F]+)\\)(.*)$')
407 gr_start = m.group(1)
409 #el_start = '0x'+m.group(2)
410 #el_end = '0x'+m.group(3)
411 start = '0x'+gr_start+'00'
412 end = '0x'+gr_start+'FF'
413 for i in range(eval(start), eval(end)):
415 l = '('+gr+','+el+')'+m.group(3)
422 outfile = file(self._OutputFilename, 'w')
423 outfile.writelines( self._OutLines )
427 infile = file(self._InputFilename,'r')
428 for line in infile.readlines():
429 # ExpandLine also LowerCase the line
430 self.ExpandLine(line) # l is [1,n] lines
435 Parse lines from a Philips document; lines look like this:
437 Syncra Scan Type 2005,10A1 VR = CS, VM = 1
441 self._InputFilename = ''
442 self._OutputFilename = ''
444 def Reformat(self,s):
445 assert self.IsGood(s)
446 patt = re.compile("^([A-Za-z0-9 -]+) ([0-9A-Z]+),([0-9A-Z]+) VR = ([A-Z][A-Z]), VM = (.*)$")
449 dicom = m.group(2) + ' ' + m.group(3) + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1)
455 patt = re.compile("^[A-Za-z0-9 -]+ [0-9A-Z]+,[0-9A-Z]+ VR = [A-Z][A-Z], VM = .*$")
def SetInputFileName(self, s):
    """Store the name of the input file in self._InputFilename."""
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Store the name of the output file in self._OutputFilename."""
    self._OutputFilename = s
468 infile = file(self._InputFilename, 'r')
470 for line in infile.readlines():
471 print self.Reformat(line)
472 outLines.append( self.Reformat(line) + '\n' )
473 outfile = file(self._OutputFilename, 'w')
474 outfile.writelines( outLines )
478 if __name__ == "__main__":
479 argc = len(os.sys.argv )
481 print "Sorry, wrong list of args"
482 os.sys.exit(1) #error
484 inputfilename = os.sys.argv[1]
485 outputfilename = os.sys.argv[2]
486 tempfile = "/tmp/mytemp"
489 dp.SetInputFileName( inputfilename )
490 #dp.SetOutputFileName( outputfilename )
491 dp.SetOutputFileName( tempfile )
494 exp = DicomV3Expander()
495 exp.SetInputFileName( tempfile )
496 exp.SetOutputFileName( outputfilename )
499 dp = TransferSyntaxParser()
500 dp.SetInputFileName( inputfilename )
501 dp.SetOutputFileName( outputfilename )
504 dp.SetInputFileName( inputfilename )
505 dp.SetOutputFileName( outputfilename )
510 dp.SetInputFileName( inputfilename )
511 dp.SetOutputFileName( outputfilename )
514 #print dp.IsAStartingLine( "(0004,1212) File-set Consistency Flag US 1\n" )