X-Git-Url: https://git.creatis.insa-lyon.fr/pubgit/?a=blobdiff_plain;f=Dicts%2FParseDict.py;h=2b3924245ab53897026a32b625fb2b949786f117;hb=97bf5414cdf6800715b7e595aed6c9c25bdde2f5;hp=86409d87c9c5543217b6aef8642037f001e3ce38;hpb=e022c5480089f9fbf2d96523749dc3664bb25d51;p=gdcm.git diff --git a/Dicts/ParseDict.py b/Dicts/ParseDict.py index 86409d87..2b392424 100644 --- a/Dicts/ParseDict.py +++ b/Dicts/ParseDict.py @@ -182,6 +182,10 @@ class UIDParser(PdfTextParser): return PdfTextParser.AddOutputLine(self,s) print "Discarding:", s + +""" +TransferSyntaxParser +""" class TransferSyntaxParser(UIDParser): def IsAFullLine(self,s): patt = re.compile('^(.*) Transfer Syntax PS ?[0-9].1?[0-9]$') @@ -190,7 +194,203 @@ class TransferSyntaxParser(UIDParser): print "Not a TS:", s return False - +""" +Papyrus parser +pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt + +I need to do a second pass for pages: +#29 since I need to find [0-9.]+ +#40,41 since it start with number in two columns !! +""" +class PapyrusParser(PdfTextParser): + def __init__(self): + self._PreviousPage = 0 + self._PreviousNumber = 0 + PdfTextParser.__init__(self) + + def IsAStartingLine(self,s): + patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$') + if( patt.match(s) ): + return True + # After page 39, lines are like: + patt = re.compile('^[0-9x]+ [0-9xA-F]+ .*$') + if( patt.match(s) ): + #print "PAge 39", s + return True + return False + + def IsAFullLine(self,s): + patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$') + if( patt.match(s) ): + return True + # After page 39, lines are like: + patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9].*$') + if( patt.match(s) ): + #print "PAge 39", s + return True + return False + + def IsAComment(self,s): + # dummy case: + if s == 'Attribute Name Tag Type Attribute Description': + #print "Dummy", s + return True + patt = re.compile('^.*ANNEXE.*$') + if patt.match(s): + return True + # Indicate page #, spaces ending with only one number + # Sometime there is a line with only one number, we need to + # make sure that page # is strictly increasing + patt = re.compile('^[1-9][0-9]+$') + if( patt.match(s) ): + p = eval(s) + if( p > self._PreviousPage): + #print "Page #", p + self._PreviousNumber = 0 + self._PreviousPage = p + return True +# else: +# print "PAGE ERROR:", s + # Now within each page there is a comment that start with a # + # let's do the page approach wich reset at each page + patt = re.compile('^[0-9]+$') + if( patt.match(s) ): + if( eval(s) > self._PreviousNumber): + #print "Number #", eval(s) + self._PreviousNumber = eval(s) + return True + #else: + # print "ERROR:", s + return False + + def AddOutputLine(self,s): + assert not self.IsAComment(s) + s = s.replace('\n','') + #print "REMOVE return:", s + patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) ([0-9C]+) (.*)$') + m = patt.match(s) + ss = 'dummy (0000,0000) 0' + if m: + ss = m.group(2) + ' ' + m.group(3) + ' ' + m.group(1) + else: + patt = re.compile('^([A-Za-z \'\(\)]+) (\\([0-9A-F]+,[0-9A-F]+\\)) (.*)$') + m = patt.match(s) + if m: + ss = m.group(2) + ' 0 ' + m.group(1) + else: + ss = s + # There is two case one that end with all capital letter + # explaining the 'DEFINED TERMS' + patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9] [A-Z, ]$') + #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9]|1\\-n [A-Z, |3.0]+$') + #patt = re.compile('^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [01n-] [A-Z, |3.0]+$') + if patt.match(s): + print "Match", s + ss = '' + self._OutLines.append(ss + '\n') + + def Open(self): + self._Infile = file(self._InputFilename, 'r') + for line in self._Infile.readlines(): + line = line[:-1] # remove '\n' + if not self.IsAComment( line ): + if self.IsAStartingLine(line): + #print "Previous buffer:",self._PreviousBuffers + previousbuffer = ' '.join(self._PreviousBuffers) + if self.IsAFullLine(previousbuffer): + self.AddOutputLine(previousbuffer) + else: + if previousbuffer: + print "Not a buffer:", previousbuffer + # We can clean buffer, since only the case 'suspicious' + + # 'Not a full line' has not added buffer to the list + self._PreviousBuffers = [] + # In all cases save the line for potentially growing this line + # just to be safe remove any white space at begining of string + assert not self.IsAComment(line) + self._PreviousBuffers.append(line.strip()) + else: + #print "Not a line",line + assert not self.IsAComment(line) + # just to be safe remove any white space at begining of string + self._PreviousBuffers.append(line.strip()) + else: + #print "Previous buffer:",self._PreviousBuffers + previousbuffer = ' '.join(self._PreviousBuffers) + if previousbuffer and self.IsAStartingLine(previousbuffer): + #print "This line is added:", previousbuffer + self.AddOutputLine( previousbuffer ) +# else: +# #print "Line is comment:", line +# print "Buffer is:", previousbuffer + # Ok this is a comment we can safely clean the buffer: + self._PreviousBuffers = [] + self.Write() + +""" +Parser for: +GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT +pdftotext -f 81 -l 90 -raw -nopgbrk 2162114_100r5.pdf 2162114_100r5.txt +""" +class GEMSParser(PdfTextParser): +# def __init__(self): +# PdfTextParser.__init__(self) + + def IsAStartingLine(self,s): + #patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$') + patt = re.compile('^[A-Za-z0-9 .#(),_/-]+ +\\([0-9A-F]+, ?[0-9A-F]+\\) +(.*)$') + if( patt.match(s) ): + return True + return False + + def IsAFullLine(self,s): + #patt = re.compile('^[A-Za-z \'\(\)]+ +\\([0-9A-F]+,[0-9A-F]+\\) +(.*)$') + patt = re.compile('^[A-Za-z0-9 .#(),_/-]+ +\\([0-9A-F]+, ?[0-9A-F]+\\) [A-Z][A-Z] [0-9]+$') + if( patt.match(s) ): + return True + print "Not full:", s + return False + + def IsAComment(self,s): + if PdfTextParser.IsAComment(self,s): + return True + #patt = re.compile('^.*GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT REV 2.2 sm 2288567-100.*$') + #if patt.match(s): + # return True + patt = re.compile('^.*GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT.*$') + if patt.match(s): + return True + patt = re.compile('^GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT.*$') + if patt.match(s): + return True + patt = re.compile('^Attribute Name Tag VR VM$') + if patt.match(s): + return True + patt = re.compile('^B.[1-9].*Private .*$') + if patt.match(s): + return True + patt = re.compile('^Table B.1.? .* Private .*$') + if patt.match(s): + return True + patt = re.compile('^Note :.*$') + if patt.match(s): + return True + patt = re.compile('^7.11.1$') + if patt.match(s): + return True + return False + + def AddOutputLine(self,s): + #print s + assert not self.IsAComment(s) + patt = re.compile('^([A-Za-z0-9 .#(),_/-]+) +\\(([0-9A-F]+), ?([0-9A-F]+)\\) ([A-Z][A-Z]) ([0-9]+)$') + m = patt.match(s) + if m: + ss = m.group(2).lower() + ' ' + m.group(3).lower() + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1) + self._OutLines.append(ss + '\n') + else: + print 'OOOPs', s + """ This class is meant to expand line like: @@ -222,8 +422,12 @@ class DicomV3Expander: s2 = m.group(2) return s1.lower() + s2 else: - print "Impossible case:", s - os.sys.exit(1) + patt = re.compile('^[0-9a-fA-F]+ [0-9a-fA-F]+ [A-Z][A-Z] [0-9n-] .*$') + if patt.match(s): + return s + else: + print "Impossible case:", s + os.sys.exit(1) def AddOutputLine(self,s): if s.__class__ == list: @@ -240,7 +444,9 @@ class DicomV3Expander: list = [] if self.NeedToExpansion(s, list): self.AddOutputLine(list) # list != [] - elif self.NeedXXExpansion(s, list): + elif self.NeedGroupXXExpansion(s, list): + self.AddOutputLine(list) # list != [] + elif self.NeedElemXXExpansion(s, list): self.AddOutputLine(list) # list != [] else: self.AddOutputLine(self.LowerCaseTag(s)) @@ -264,7 +470,7 @@ class DicomV3Expander: # If line is like: # (50xx,1200) Number of Patient Related Studies IS 1 - def NeedXXExpansion(self,s,list): + def NeedGroupXXExpansion(self,s,list): patt = re.compile('^\\(([0-9a-fA-F]+)xx,([0-9a-fA-F]+)\\)(.*)$') m = patt.match(s) if m: @@ -283,6 +489,40 @@ class DicomV3Expander: return True return False + # If line is like: + # (2001,xx00) Number of Patient Related Studies IS 1 + def NeedElemXXExpansion(self,s,list): + patt = re.compile('^([0-9a-fA-F]+) ([0-9a-fA-F]+)xx(.*)$') + m = patt.match(s) + if m: + #print m.groups() + gr = m.group(1) + el_start = m.group(2) + start = '0x00' + end = '0xFF' + for i in range(eval(start), eval(end)): + el = '%02x'% i + l = '('+gr+','+el_start+el+')'+m.group(3) + print l + list.append(l) + return True + else: + patt = re.compile('^([0-9a-fA-F]+) xx([0-9a-fA-F]+)(.*)$') + m = patt.match(s) + if m: + #print m.groups() + gr = m.group(1) + el_start = m.group(2) + start = '0x00' + end = '0xFF' + for i in range(eval(start), eval(end)): + el = '%02x'% i + l = '('+gr+','+el+el_start+')'+m.group(3) + print l + list.append(l) + return True + return False + def Write(self): outfile = file(self._OutputFilename, 'w') outfile.writelines( self._OutLines ) @@ -296,6 +536,149 @@ class DicomV3Expander: self.Write() infile.close() +""" +Parse line from a philips document, line are like this: + +Syncra Scan Type 2005,10A1 VR = CS, VM = 1 +""" +class InteraParser: + def __init__(self): + self._InputFilename = '' + self._OutputFilename = '' + + def Reformat(self,s): + assert self.IsGood(s) + patt = re.compile("^([A-Za-z0-9 -]+) ([0-9A-Z]+),([0-9A-Z]+) VR = ([A-Z][A-Z]), VM = (.*)$") + m = patt.match(s) + if m: + dicom = m.group(2) + ' ' + m.group(3) + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1) + return dicom + else: + print "oops" + + def IsGood(self,s): + patt = re.compile("^[A-Za-z0-9 -]+ [0-9A-Z]+,[0-9A-Z]+ VR = [A-Z][A-Z], VM = .*$") + if patt.match(s): + return True + print "Not good:", s + return False + + def SetInputFileName(self,s): + self._InputFilename = s + + def SetOutputFileName(self,s): + self._OutputFilename = s + + def Parse(self): + infile = file(self._InputFilename, 'r') + outLines = [] + for line in infile.readlines(): + print self.Reformat(line) + outLines.append( self.Reformat(line) + '\n' ) + outfile = file(self._OutputFilename, 'w') + outfile.writelines( outLines ) + outfile.close() + +""" +Parse line from a dicom3tools document, line are like this: + +(0003,0008) VERS="SSPI" VR="US" VM="1" Owner="SIEMENS ISI" Keyword="ISICommandField" Name="ISI Command Field" +""" +class Dicom3ToolsParser: + def __init__(self): + self._InputFilename = '' + self._OutputFilename = '' + + def Reformat(self,s): + assert self.IsGood(s) + patt = re.compile("^\(([0-9a-f]+),([0-9a-f]+)\)\s+VERS=\".*\"\s+VR=\"([A-Z][A-Z])\"\s+VM=\"(.*)\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\"(.*)\"$") + m = patt.match(s) + dicom = '' + if m: + # Apparently some have Name == '?', skip those + name = m.group(5) + if name != '?' and name != '? ': + dicom = m.group(1) + ' ' + m.group(2) + ' ' + m.group(3) + ' ' + m.group(4) + ' ' + m.group(5) + else: + print "oops" + else: + print "oops" + return dicom + + def IsGood(self,s): + #patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\) VERS=\".*\" VR=\"[A-Z][A-Z]\" VM=\".*\" Owner=\".*\" Keyword=\".*\" Name=\".*\"$") + patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\)\s+VERS=\".*\"\s+VR=\"[A-Z][A-Z]\"\s+VM=\".*\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\".*\".*$") + if patt.match(s): + return True + print "Not good:", s + return False + + def SetInputFileName(self,s): + self._InputFilename = s + + def SetOutputFileName(self,s): + self._OutputFilename = s + + def Parse(self): + infile = file(self._InputFilename, 'r') + outLines = [] + for line in infile.readlines(): + newline = self.Reformat(line) + print newline + if newline: + outLines.append( newline + '\n' ) + outfile = file(self._OutputFilename, 'w') + outfile.writelines( outLines ) + outfile.close() + +""" +Parse line from a PhilipsAdvance document, line are like this: + +GE Advance Implementation Version Name (0009,1001) 3 LO 2 n/a +""" +class GEAdvanceParser: + def __init__(self): + self._InputFilename = '' + self._OutputFilename = '' + + def Reformat(self,s): + assert self.IsGood(s) + #patt = re.compile("^\(([0-9a-f]+),([0-9a-f]+)\)\s+VERS=\".*\"\s+VR=\"([A-Z][A-Z])\"\s+VM=\"(.*)\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\"(.*)\"$") + patt = re.compile("^([A-Za-z0-9 ._>]+) \\(([0-9A-F]+),([0-9A-F]+)\\) [0-9] ([A-Z][A-Z]) ([0-9]) .*$") + m = patt.match(s) + dicom = '' + if m: + dicom = m.group(2) + ' ' + m.group(3).lower() + ' ' + m.group(4) + ' ' + m.group(5) + ' ' + m.group(1) + else: + print "oops" + return dicom + + def IsGood(self,s): + #patt = re.compile("^\([0-9a-f]+,[0-9a-f]+\)\s+VERS=\".*\"\s+VR=\"[A-Z][A-Z]\"\s+VM=\".*\"\s+Owner=\".*\"\s+Keyword=\".*\"\s+Name=\".*\".*$") + patt = re.compile("^[A-Za-z0-9 ._>]+ \\([0-9A-F]+,[0-9A-F]+\\) [0-9] [A-Z][A-Z] [0-9] .*$") + if patt.match(s): + return True + print "Not good:", s + return False + + def SetInputFileName(self,s): + self._InputFilename = s + + def SetOutputFileName(self,s): + self._OutputFilename = s + + def Parse(self): + infile = file(self._InputFilename, 'r') + outLines = [] + for line in infile.readlines(): + newline = self.Reformat(line) + #print newline + if newline: + outLines.append( newline + '\n' ) + outfile = file(self._OutputFilename, 'w') + outfile.writelines( outLines ) + outfile.close() + if __name__ == "__main__": argc = len(os.sys.argv ) if ( argc < 3 ): @@ -311,15 +694,42 @@ if __name__ == "__main__": #dp.SetOutputFileName( outputfilename ) dp.SetOutputFileName( tempfile ) dp.Parse() - exp = DicomV3Expander() - exp.SetInputFileName( tempfile ) + #exp.SetInputFileName( tempfile ) + exp.SetInputFileName( inputfilename ) exp.SetOutputFileName( outputfilename ) exp.Expand() - """ + dp = TransferSyntaxParser() dp.SetInputFileName( inputfilename ) dp.SetOutputFileName( outputfilename ) dp.Parse() + dp = PapyrusParser() + dp.SetInputFileName( inputfilename ) + dp.SetOutputFileName( outputfilename ) + dp.Parse() + + dp = InteraParser() + dp.SetInputFileName( inputfilename ) + dp.SetOutputFileName( outputfilename ) + dp.Parse() + dp = GEMSParser() + dp.SetInputFileName( inputfilename ) + dp.SetOutputFileName( outputfilename ) + dp.Parse() + + """ + dp = Dicom3ToolsParser() + dp.SetInputFileName( inputfilename ) + dp.SetOutputFileName( outputfilename ) + dp.Parse() + + """ + dp = GEAdvanceParser() + dp.SetInputFileName( inputfilename ) + dp.SetOutputFileName( outputfilename ) + dp.Parse() + """ + #print dp.IsAStartingLine( "(0004,1212) File-set Consistency Flag US 1\n" )