+"""
+Papyrus parser
+pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt
+
+A second pass is needed for these pages:
+#29, since I need to find [0-9.]+
+#40 and #41, since they start with numbers in two columns!
+"""
class PapyrusParser(PdfTextParser):
    """Parser for the attribute tables found in the Papyrus text dump.

    Input lines are accumulated in ``self._PreviousBuffers`` until the start
    of the next table row, a comment line, or the end of the file is reached.
    The accumulated row is then reformatted by :meth:`AddOutputLine` and
    appended to ``self._OutLines``.
    """

    # Row of the form "Attribute Name (GGGG,EEEE) <rest>".
    _NAMED_ROW = re.compile(
        r"^[A-Za-z '\(\)]+ +\([0-9A-F]+,[0-9A-F]+\) +(.*)$")
    # After page 39 the rows begin with two numeric/hex columns instead.
    _NUMERIC_ROW_START = re.compile(r"^[0-9x]+ [0-9xA-F]+ .*$")
    _NUMERIC_ROW_FULL = re.compile(
        r"^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9].*$")

    _ANNEXE = re.compile(r"^.*ANNEXE.*$")
    # Candidate page number: at least two digits, no leading zero.
    _PAGE_NUMBER = re.compile(r"^[1-9][0-9]+$")
    # Any line made only of digits.
    _BARE_NUMBER = re.compile(r"^[0-9]+$")

    # Named row whose third column is made of digits and/or 'C'.
    _ROW_WITH_THIRD_COLUMN = re.compile(
        r"^([A-Za-z '\(\)]+) (\([0-9A-F]+,[0-9A-F]+\)) ([0-9C]+) (.*)$")
    # Named row without that third column.
    _ROW_WITHOUT_THIRD_COLUMN = re.compile(
        r"^([A-Za-z '\(\)]+) (\([0-9A-F]+,[0-9A-F]+\)) (.*)$")
    # Numeric row ending with a single capital letter, comma or space
    # (the 'DEFINED TERMS' case mentioned by the original author).
    _DEFINED_TERMS_ROW = re.compile(
        r"^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9] [A-Z, ]$")

    def __init__(self):
        """Reset the page / in-page counters, then initialise the base parser."""
        self._PreviousPage = 0
        self._PreviousNumber = 0
        PdfTextParser.__init__(self)

    def IsAStartingLine(self, s):
        """Return True when ``s`` looks like the beginning of a table row."""
        if self._NAMED_ROW.match(s):
            return True
        return self._NUMERIC_ROW_START.match(s) is not None

    def IsAFullLine(self, s):
        """Return True when ``s`` looks like a complete table row."""
        if self._NAMED_ROW.match(s):
            return True
        return self._NUMERIC_ROW_FULL.match(s) is not None

    def IsAComment(self, s):
        """Return True when ``s`` is not table content.

        Recognised comments are the table header line, any line containing
        'ANNEXE', a page number, and an in-page running number.

        Side effect: accepting a page number stores it in
        ``self._PreviousPage`` and resets ``self._PreviousNumber``; accepting
        an in-page number stores it in ``self._PreviousNumber``.
        """
        if s == 'Attribute Name Tag Type Attribute Description':
            return True
        if self._ANNEXE.match(s):
            return True
        # A line holding a single number is a page number only when it is
        # strictly greater than the last page number seen.
        if self._PAGE_NUMBER.match(s):
            # int() rather than eval(): the text comes from a file, and a
            # digit string with a leading zero is not a plain decimal literal.
            page = int(s)
            if page > self._PreviousPage:
                self._PreviousNumber = 0
                self._PreviousPage = page
                return True
        # Otherwise it may be an in-page number; those must be strictly
        # increasing within a page (the counter is reset on each new page).
        if self._BARE_NUMBER.match(s):
            number = int(s)
            if number > self._PreviousNumber:
                self._PreviousNumber = number
                return True
        return False

    def AddOutputLine(self, s):
        """Reformat one complete row and append it to ``self._OutLines``.

        Named rows are emitted as "<tag> <third column> <name>", with '0'
        standing in when the third column is absent.  Any other row is
        emitted unchanged, except a row matching the 'DEFINED TERMS' shape,
        which is reported on stdout and emitted as an empty line.
        """
        assert not self.IsAComment(s)
        s = s.replace('\n', '')
        found = self._ROW_WITH_THIRD_COLUMN.match(s)
        if found:
            ss = found.group(2) + ' ' + found.group(3) + ' ' + found.group(1)
        else:
            found = self._ROW_WITHOUT_THIRD_COLUMN.match(s)
            if found:
                ss = found.group(2) + ' 0 ' + found.group(1)
            else:
                ss = s
                if self._DEFINED_TERMS_ROW.match(s):
                    print("Match %s" % s)
                    ss = ''
        self._OutLines.append(ss + '\n')

    def _FlushPending(self):
        """Emit the buffered row if it starts like a row, then clear the buffer."""
        pending = ' '.join(self._PreviousBuffers)
        if pending and self.IsAStartingLine(pending):
            self.AddOutputLine(pending)
        self._PreviousBuffers = []

    def Open(self):
        """Read ``self._InputFilename``, rebuild the rows and call ``Write()``."""
        # The file object is still published as self._Infile, but it is now
        # closed as soon as reading is finished.
        with open(self._InputFilename, 'r') as self._Infile:
            for line in self._Infile:
                # Drop only the line terminator; the last line of a file may
                # not have one.
                line = line.rstrip('\n')
                if self.IsAComment(line):
                    # A comment ends whatever row was being accumulated.
                    self._FlushPending()
                    continue
                if self.IsAStartingLine(line):
                    pending = ' '.join(self._PreviousBuffers)
                    if self.IsAFullLine(pending):
                        self.AddOutputLine(pending)
                    elif pending:
                        print("Not a buffer: %s" % pending)
                    self._PreviousBuffers = []
                # In every non-comment case keep the line, so that following
                # lines can extend it.
                self._PreviousBuffers.append(line.strip())
        # End of input is a boundary too: do not lose the last buffered row.
        self._FlushPending()
        self.Write()
+
+"""
+Parser for:
+GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT
+pdftotext -f 81 -l 90 -raw -nopgbrk 2162114_100r5.pdf 2162114_100r5.txt
+"""
class GEMSParser(PdfTextParser):
    """Parser for the private-attribute tables of a GE conformance statement.

    Table rows have the shape "Attribute Name (GGGG,EEEE) VR VM"; page
    headers, table titles and notes are treated as comments.
    """

    # Beginning of a row: name followed by a "(group, element)" tag.
    _ROW_START = re.compile(
        r"^[A-Za-z0-9 .#(),_/-]+ +\([0-9A-F]+, ?[0-9A-F]+\) +(.*)$")
    # Complete row: name, tag, two capital letters, then a number.
    _ROW_FULL = re.compile(
        r"^[A-Za-z0-9 .#(),_/-]+ +\([0-9A-F]+, ?[0-9A-F]+\) [A-Z][A-Z] [0-9]+$")
    # Same shape as _ROW_FULL, capturing name, group, element and the two
    # trailing columns.
    _ROW_FIELDS = re.compile(
        r"^([A-Za-z0-9 .#(),_/-]+) +\(([0-9A-F]+), ?([0-9A-F]+)\)"
        r" ([A-Z][A-Z]) ([0-9]+)$")

    # Lines that are never table content.  Dots inside section and table
    # numbers are escaped so they match a literal '.' only.
    _COMMENT_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r"^.*GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT.*$",
        r"^GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT.*$",
        r"^Attribute Name Tag VR VM$",
        r"^B\.[1-9].*Private .*$",
        r"^Table B\.1.? .* Private .*$",
        r"^Note :.*$",
        r"^7\.11\.1$",
    ))

    def IsAStartingLine(self, s):
        """Return True when ``s`` looks like the beginning of a table row."""
        return self._ROW_START.match(s) is not None

    def IsAFullLine(self, s):
        """Return True when ``s`` is a complete row; otherwise report it on stdout."""
        if self._ROW_FULL.match(s):
            return True
        print("Not full: %s" % s)
        return False

    def IsAComment(self, s):
        """Return True when the base parser or a local pattern marks ``s`` as a comment."""
        if PdfTextParser.IsAComment(self, s):
            return True
        return any(pattern.match(s) for pattern in self._COMMENT_PATTERNS)

    def AddOutputLine(self, s):
        """Append a row to ``self._OutLines`` as "group element VR VM name".

        Group and element are lower-cased.  A row that does not have the
        expected shape is reported on stdout and not emitted.
        """
        assert not self.IsAComment(s)
        found = self._ROW_FIELDS.match(s)
        if not found:
            print("OOOPs %s" % s)
            return
        name, group, element, vr, vm = found.groups()
        fields = (group.lower(), element.lower(), vr, vm, name)
        self._OutLines.append(' '.join(fields) + '\n')
+