3 Let's write our own python parser to clean up the pdf (after
5 Instructions: run pdftotext like this:
7 $ pdftotext -f 9 -l 81 -raw -nopgbrk 04_06PU.PDF 04_06PU-3.txt
9 then run the python parser like this:
11 $ python ParseDict.py 04_06PU.txt dicomV3.dic
16 PdfTextParser takes as input a text file (produced by pdftotext)
17 and creates as output a clean file (ready to be processed) by
19 Warning: PdfTextParser does not expand:
20 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
22 - (12xx, 3456) comment...
28 self._InputFilename = ''
29 self._OutputFilename = ''
32 self._PreviousBuffers = []
def SetInputFileName(self, s):
    """Remember ``s`` as the path of the text file to read.

    The value is only stored in ``self._InputFilename``; nothing is
    opened here.
    """
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Remember ``s`` as the path of the file to write.

    The value is only stored in ``self._OutputFilename``; nothing is
    opened here.
    """
    self._OutputFilename = s
40 # Return whether s is definitely a comment
41 def IsAComment(self,s):
43 if s == "Tag Name VR VM":
45 elif s == "PS 3.6-2003":
47 elif s == "PS 3.6-2004":
49 patt = re.compile('^Page [0-9]+$')
54 def IsAStartingLine(self,s):
55 patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*)$')
60 def IsAFullLine(self,s):
61 patt = re.compile('^\\([0-9a-fA-Fx]+,[0-9a-fA-F]+\\) (.*) [A-Z][A-Z] [0-9]$')
66 # FIXME this function could be avoided...
67 def IsSuspicious(self,s):
def AddOutputLine(self, s):
    """Queue ``s`` as one output line.

    ``s`` must not be recognised as a comment by ``self.IsAComment``;
    this is enforced with an ``assert``. A newline is appended before the
    line is stored in ``self._OutLines``.
    """
    # Keep the check inside the assert so it disappears under -O exactly
    # as before.
    assert not self.IsAComment(s)
    terminated = s + '\n'
    self._OutLines.append(terminated)
78 self._Infile = file(self._InputFilename, 'r')
79 for line in self._Infile.readlines():
80 line = line[:-1] # remove '\n'
81 if not self.IsAComment( line ):
82 if self.IsAStartingLine(line):
83 #print "Previous buffer:",self._PreviousBuffers
84 previousbuffer = ' '.join(self._PreviousBuffers)
85 if self.IsAStartingLine(previousbuffer):
86 if not self.IsSuspicious(previousbuffer):
87 self.AddOutputLine(previousbuffer)
89 # this case should not happen if I were to rewrite the
90 # thing I should be able to clean that
91 #print "Suspicious:", previousbuffer
92 #print "List is:", self._PreviousBuffers
93 s = self._PreviousBuffers[0]
94 if self.IsAFullLine(s):
95 # That means we have a weird line that does not start
96 # as usual (xxxx,xxxx) therefore we tried constructing
97 # a buffer using the complete previous line...
98 #print "Full line:", s
100 s2 = ' '.join(self._PreviousBuffers[1:])
101 #print "Other Full line:", s2
102 self.AddOutputLine(s2)
104 # we have a suspiciously long line; that could
105 # happen, so let's check:
106 if self.IsAFullLine(previousbuffer):
107 self.AddOutputLine(previousbuffer)
109 # This is the only case where we do not add
110 # previousbuffer to the _OutLines
111 print "Suspicious and Not a full line:", s
114 print "Not a buffer:", previousbuffer
115 # We can clean buffer, since only the case 'suspicious' +
116 # 'Not a full line' has not added buffer to the list
117 self._PreviousBuffers = []
118 # In all cases save the line for potentially growing this line
119 assert not self.IsAComment(line)
120 self._PreviousBuffers.append(line)
122 #print "Not a line",line
123 assert not self.IsAComment(line)
124 self._PreviousBuffers.append(line)
126 #print "Comment:",line
127 previousbuffer = ' '.join(self._PreviousBuffers)
128 if previousbuffer and self.IsAStartingLine(previousbuffer):
129 #print "This line is added:", previousbuffer
130 self.AddOutputLine( previousbuffer )
132 #print "Line is comment:", line
133 print "Buffer is:", previousbuffer
134 # Ok this is a comment we can safely clean the buffer:
135 self._PreviousBuffers = []
139 outfile = file(self._OutputFilename, 'w')
140 outfile.writelines( self._OutLines )
144 # Main function to call for parsing
151 class UIDParser(PdfTextParser):
152 def IsAStartingLine(self,s):
153 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*)$')
159 def IsAFullLine(self,s):
160 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) PS ?[0-9].1?[0-9]$')
163 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) Well-known frame of reference$')
166 patt = re.compile('^1.2.840.10008.[0-9.]+ (.*) \\(Retired\\)$')
171 def IsAComment(self,s):
172 if PdfTextParser.IsAComment(self,s):
174 # else let's enhance the super class
175 patt = re.compile('^SPM2 (.*) http(.*)$')
180 def AddOutputLine(self,s):
181 if self.IsAFullLine(s):
182 return PdfTextParser.AddOutputLine(self,s)
183 print "Discarding:", s
189 class TransferSyntaxParser(UIDParser):
190 def IsAFullLine(self,s):
191 patt = re.compile('^(.*) Transfer Syntax PS ?[0-9].1?[0-9]$')
193 return UIDParser.IsAStartingLine(self,s)
199 pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt
201 I need to do a second pass for pages:
202 #29 since I need to find [0-9.]+
203 #40,41 since they start with a number in two columns !!
205 class PapyrusParser(PdfTextParser):
207 self._PreviousPage = 0
208 self._PreviousNumber = 0
209 PdfTextParser.__init__(self)
211 def IsAStartingLine(self,s):
212 patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
215 # After page 39, lines are like:
216 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .*$')
222 def IsAFullLine(self,s):
223 patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
226 # After page 39, lines are like:
227 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9].*$')
233 def IsAComment(self,s):
235 if s == 'Attribute Name Tag Type Attribute Description':
238 patt = re.compile('^.*ANNEXE.*$')
241 # Indicate page #, spaces ending with only one number
242 # Sometimes there is a line with only one number, so we need to
243 # make sure that page # is strictly increasing
244 patt = re.compile('^[1-9][0-9]+$')
247 if( p > self._PreviousPage):
249 self._PreviousNumber = 0
250 self._PreviousPage = p
253 # print "PAGE ERROR:", s
254 # Now within each page there is a comment that starts with a #;
255 # let's use the per-page approach, which resets at each page
256 patt = re.compile('^[0-9]+$')
258 if( eval(s) > self._PreviousNumber):
259 #print "Number #", eval(s)
260 self._PreviousNumber = eval(s)
266 def AddOutputLine(self,s):
267 assert not self.IsAComment(s)
268 s = s.replace('\n','')
269 #print "REMOVE return:", s
270 patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) ([0-9C]+) (.*)$')
272 ss = 'dummy (0000,0000) 0'
274 ss = m.group(2) + ' ' + m.group(3) + ' ' + m.group(1)
276 patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) (.*)$')
279 ss = m.group(2) + ' 0 ' + m.group(1)
282 # There are two cases: one that ends with all capital letters
283 # explaining the 'DEFINED TERMS'
284 patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9] [A-Z, ]$')
285 #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9]|1\\-n [A-Z, |3.0]+$')
286 #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [01n-] [A-Z, |3.0]+$')
290 self._OutLines.append(ss + '\n')
293 self._Infile = file(self._InputFilename, 'r')
294 for line in self._Infile.readlines():
295 line = line[:-1] # remove '\n'
296 if not self.IsAComment( line ):
297 if self.IsAStartingLine(line):
298 #print "Previous buffer:",self._PreviousBuffers
299 previousbuffer = ' '.join(self._PreviousBuffers)
300 if self.IsAFullLine(previousbuffer):
301 self.AddOutputLine(previousbuffer)
304 print "Not a buffer:", previousbuffer
305 # We can clean buffer, since only the case 'suspicious' +
306 # 'Not a full line' has not added buffer to the list
307 self._PreviousBuffers = []
308 # In all cases save the line for potentially growing this line
309 # just to be safe remove any white space at beginning of string
310 assert not self.IsAComment(line)
311 self._PreviousBuffers.append(line.strip())
313 #print "Not a line",line
314 assert not self.IsAComment(line)
315 # just to be safe remove any white space at beginning of string
316 self._PreviousBuffers.append(line.strip())
318 #print "Previous buffer:",self._PreviousBuffers
319 previousbuffer = ' '.join(self._PreviousBuffers)
320 if previousbuffer and self.IsAStartingLine(previousbuffer):
321 #print "This line is added:", previousbuffer
322 self.AddOutputLine( previousbuffer )
324 # #print "Line is comment:", line
325 # print "Buffer is:", previousbuffer
326 # Ok this is a comment we can safely clean the buffer:
327 self._PreviousBuffers = []
332 GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT
333 pdftotext -f 81 -l 90 -raw -nopgbrk 2162114_100r5.pdf 2162114_100r5.txt
335 class GEMSParser(PdfTextParser):
336 # def __init__(self):
337 # PdfTextParser.__init__(self)
339 def IsAStartingLine(self,s):
340 #patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
341 patt = re.compile('^[A-Za-z0-9 .#(),_/-]+ +\\([0-9A-F]+, ?[0-9A-F]+\\) +(.*)$')
346 def IsAFullLine(self,s):
347 #patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$')
348 patt = re.compile('^[A-Za-z0-9 .#(),_/-]+ +\\([0-9A-F]+, ?[0-9A-F]+\\) [A-Z][A-Z] [0-9]+$')
354 def IsAComment(self,s):
355 if PdfTextParser.IsAComment(self,s):
357 #patt = re.compile('^.*GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT REV 2.2 sm 2288567-100.*$')
360 patt = re.compile('^.*GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT.*$')
363 patt = re.compile('^GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT.*$')
366 patt = re.compile('^Attribute Name Tag VR VM$')
369 patt = re.compile('^B.[1-9].*Private .*$')
372 patt = re.compile('^Table B.1.? .* Private .*$')
375 patt = re.compile('^Note :.*$')
378 patt = re.compile('^7.11.1$')
383 def AddOutputLine(self,s):
385 assert not self.IsAComment(s)
386 patt = re.compile('^([A-Za-z0-9 .#(),_/-]+) +\\(([0-9A-F]+), ?([0-9A-F]+)\\) ([A-Z][A-Z]) ([0-9]+)$')
389 ss = m.group(2).lower() + ' ' + m.group(3).lower() + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1)
390 self._OutLines.append(ss + '\n')
396 This class is meant to expand lines like:
397 - (xxxx,xxxx to xxxx) xxxxxxxxxxxx
399 - (12xx, 3456) comment...
402 class DicomV3Expander:
404 self._InputFilename = ''
405 self._OutputFilename = ''
def SetInputFileName(self, s):
    """Store ``s`` in ``self._InputFilename`` as the file to expand."""
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Store ``s`` in ``self._OutputFilename`` as the destination file."""
    self._OutputFilename = s
414 # Function to convert a tag to lower case:
415 # ex: (ABCD, EF01) -> (abcd, ef01)
416 def LowerCaseTag(self,s):
417 #print "Before:", s[:-1]
418 patt = re.compile('^(\\([0-9a-fA-F]+,[0-9a-fA-F]+\\))(.*)$')
423 return s1.lower() + s2
425 patt = re.compile('^[0-9a-fA-F]+ [0-9a-fA-F]+ [A-Z][A-Z] [0-9n-] .*$')
429 print "Impossible case:", s
432 def AddOutputLine(self,s):
433 if s.__class__ == list:
435 self._OutLines.append(i + '\n')
437 self._OutLines.append(s + '\n')
439 # Expand the line appropriately and also add it to the
441 def ExpandLine(self, s):
443 s = s[:-1] # remove \n
445 if self.NeedToExpansion(s, list):
446 self.AddOutputLine(list) # list != []
447 elif self.NeedGroupXXExpansion(s, list):
448 self.AddOutputLine(list) # list != []
449 elif self.NeedElemXXExpansion(s, list):
450 self.AddOutputLine(list) # list != []
452 self.AddOutputLine(self.LowerCaseTag(s))
455 # (0020,3100 to 31FF) Source Image Ids RET
456 def NeedToExpansion(self,s, list):
457 patt = re.compile('^\\(([0-9a-fA-F]+),([0-9a-fA-F]+) to ([0-9a-fA-F]+)\\)(.*)$')
462 el_start = '0x'+m.group(2)
463 el_end = '0x'+m.group(3)
464 for i in range(eval(el_start), eval(el_end)):
466 l = '('+gr+','+el+')'+m.group(4)
472 # (50xx,1200) Number of Patient Related Studies IS 1
473 def NeedGroupXXExpansion(self,s,list):
474 patt = re.compile('^\\(([0-9a-fA-F]+)xx,([0-9a-fA-F]+)\\)(.*)$')
478 gr_start = m.group(1)
480 #el_start = '0x'+m.group(2)
481 #el_end = '0x'+m.group(3)
482 start = '0x'+gr_start+'00'
483 end = '0x'+gr_start+'FF'
484 for i in range(eval(start), eval(end)):
486 l = '('+gr+','+el+')'+m.group(3)
493 # (2001,xx00) Number of Patient Related Studies IS 1
494 def NeedElemXXExpansion(self,s,list):
495 patt = re.compile('^([0-9a-fA-F]+) ([0-9a-fA-F]+)xx(.*)$')
500 el_start = m.group(2)
503 for i in range(eval(start), eval(end)):
505 l = '('+gr+','+el_start+el+')'+m.group(3)
510 patt = re.compile('^([0-9a-fA-F]+) xx([0-9a-fA-F]+)(.*)$')
515 el_start = m.group(2)
518 for i in range(eval(start), eval(end)):
520 l = '('+gr+','+el+el_start+')'+m.group(3)
527 outfile = file(self._OutputFilename, 'w')
528 outfile.writelines( self._OutLines )
532 infile = file(self._InputFilename,'r')
533 for line in infile.readlines():
534 # ExpandLine also LowerCase the line
535 self.ExpandLine(line) # l is [1,n] lines
540 Parse lines from a Philips document; lines look like this:
542 Syncra Scan Type 2005,10A1 VR = CS, VM = 1
546 self._InputFilename = ''
547 self._OutputFilename = ''
549 def Reformat(self,s):
550 assert self.IsGood(s)
551 patt = re.compile("^([A-Za-z0-9 -]+) ([0-9A-Z]+),([0-9A-Z]+) VR = ([A-Z][A-Z]), VM = (.*)$")
554 dicom = m.group(2) + ' ' + m.group(3) + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1)
560 patt = re.compile("^[A-Za-z0-9 -]+ [0-9A-Z]+,[0-9A-Z]+ VR = [A-Z][A-Z], VM = .*$")
def SetInputFileName(self, s):
    """Record ``s`` in ``self._InputFilename`` as the file to parse."""
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Record ``s`` in ``self._OutputFilename`` as the file to write."""
    self._OutputFilename = s
573 infile = file(self._InputFilename, 'r')
575 for line in infile.readlines():
576 print self.Reformat(line)
577 outLines.append( self.Reformat(line) + '\n' )
578 outfile = file(self._OutputFilename, 'w')
579 outfile.writelines( outLines )
583 Parse lines from a dicom3tools document; lines look like this:
585 (0003,0008) VERS="SSPI" VR="US" VM="1" Owner="SIEMENS ISI" Keyword="ISICommandField" Name="ISI Command Field"
587 class Dicom3ToolsParser:
589 self._InputFilename = ''
590 self._OutputFilename = ''
592 def Reformat(self,s):
593 assert self.IsGood(s)
594 patt = re.compile("^\(([0-9a-f]+),([0-9a-f]+)\)\s+VERS=\".*\"\s+VR=\"([A-Z][A-Z])\"\s+VM=\"(.*)\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\"(.*)\"$")
598 # Apparently some have Name == '?', skip those
600 if name != '?' and name != '? ':
601 dicom = m.group(1) + ' ' + m.group(2) + ' ' + m.group(3) + ' ' + m.group(4) + ' ' + m.group(5)
609 #patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\) VERS=\".*\" VR=\"[A-Z][A-Z]\" VM=\".*\" Owner=\".*\" Keyword=\".*\" Name=\".*\"$")
610 patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\)\s+VERS=\".*\"\s+VR=\"[A-Z][A-Z]\"\s+VM=\".*\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\".*\".*$")
def SetInputFileName(self, s):
    """Keep ``s`` in ``self._InputFilename`` as the input path."""
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Keep ``s`` in ``self._OutputFilename`` as the output path."""
    self._OutputFilename = s
623 infile = file(self._InputFilename, 'r')
625 for line in infile.readlines():
626 newline = self.Reformat(line)
629 outLines.append( newline + '\n' )
630 outfile = file(self._OutputFilename, 'w')
631 outfile.writelines( outLines )
635 Parse lines from a GE Advance document; lines look like this:
637 GE Advance Implementation Version Name (0009,1001) 3 LO 2 n/a
639 class GEAdvanceParser:
641 self._InputFilename = ''
642 self._OutputFilename = ''
644 def Reformat(self,s):
645 assert self.IsGood(s)
646 #patt = re.compile("^\(([0-9a-f]+),([0-9a-f]+)\)\s+VERS=\".*\"\s+VR=\"([A-Z][A-Z])\"\s+VM=\"(.*)\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\"(.*)\"$")
647 patt = re.compile("^([A-Za-z0-9 ._>]+) \\(([0-9A-F]+),([0-9A-F]+)\\) [0-9] ([A-Z][A-Z]) ([0-9]) .*$")
651 dicom = m.group(2) + ' ' + m.group(3).lower() + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1)
657 #patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\)\s+VERS=\".*\"\s+VR=\"[A-Z][A-Z]\"\s+VM=\".*\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\".*\".*$")
658 patt = re.compile("^[A-Za-z0-9 ._>]+ \\([0-9A-F]+,[0-9A-F]+\\) [0-9] [A-Z][A-Z] [0-9] .*$")
def SetInputFileName(self, s):
    """Set ``self._InputFilename`` to ``s``, the path to read from."""
    self._InputFilename = s
def SetOutputFileName(self, s):
    """Set ``self._OutputFilename`` to ``s``, the path to write to."""
    self._OutputFilename = s
671 infile = file(self._InputFilename, 'r')
673 for line in infile.readlines():
674 newline = self.Reformat(line)
677 outLines.append( newline + '\n' )
678 outfile = file(self._OutputFilename, 'w')
679 outfile.writelines( outLines )
682 if __name__ == "__main__":
683 argc = len(os.sys.argv )
685 print "Sorry, wrong list of args"
686 os.sys.exit(1) #error
688 inputfilename = os.sys.argv[1]
689 outputfilename = os.sys.argv[2]
690 tempfile = "/tmp/mytemp"
693 dp.SetInputFileName( inputfilename )
694 #dp.SetOutputFileName( outputfilename )
695 dp.SetOutputFileName( tempfile )
697 exp = DicomV3Expander()
698 #exp.SetInputFileName( tempfile )
699 exp.SetInputFileName( inputfilename )
700 exp.SetOutputFileName( outputfilename )
703 dp = TransferSyntaxParser()
704 dp.SetInputFileName( inputfilename )
705 dp.SetOutputFileName( outputfilename )
709 dp.SetInputFileName( inputfilename )
710 dp.SetOutputFileName( outputfilename )
714 dp.SetInputFileName( inputfilename )
715 dp.SetOutputFileName( outputfilename )
718 dp.SetInputFileName( inputfilename )
719 dp.SetOutputFileName( outputfilename )
723 dp = Dicom3ToolsParser()
724 dp.SetInputFileName( inputfilename )
725 dp.SetOutputFileName( outputfilename )
729 dp = GEAdvanceParser()
730 dp.SetInputFileName( inputfilename )
731 dp.SetOutputFileName( outputfilename )
735 #print dp.IsAStartingLine( "(0004,1212) File-set Consistency Flag US 1\n" )