First commit

gatesburg · Apr 9, 2013 · 3e96427 · 3e96427
commit 3e96427
Show file tree

Hide file tree

Showing 5 changed files with 433 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/cobol.py b/cobol.py
@@ -0,0 +1,320 @@
+import re
+
+class CobolPatterns:
+    """Regular expressions for copybook rows and PIC strings.
+
+    ``row_pattern`` matches one complete single-line entry: a two digit
+    level and a name, optionally followed (in this order) by REDEFINES,
+    OCCURS, INDEXED BY and PIC clauses, and closed by a period.
+    """
+
+    # Wraps a clause so that the clause as a whole becomes optional
+    opt_pattern_format = "({})?"
+
+    row_pattern_base = r'^(?P<level>\d{2})\s+(?P<name>\S+)'
+    # The word TIMES may be left out and any run of whitespace may separate
+    # the words of a clause
+    row_pattern_occurs = r'\s+OCCURS\s+(?P<occurs>\d+)(?:\s+TIMES)?'
+    row_pattern_indexed_by = r"\s+INDEXED\s+BY\s+(?P<indexed_by>\S+)"
+    row_pattern_redefines = r"\s+REDEFINES\s+(?P<redefines>\S+)"
+    row_pattern_pic = r'\s+PIC\s+(?P<pic>\S+)'
+    row_pattern_end = r'\.$'
+
+    row_pattern = re.compile(row_pattern_base +
+                             opt_pattern_format.format(row_pattern_redefines) +
+                             opt_pattern_format.format(row_pattern_occurs) +
+                             opt_pattern_format.format(row_pattern_indexed_by) +
+                             opt_pattern_format.format(row_pattern_pic) +
+                             row_pattern_end)
+
+    # A character followed by its repeat count in parentheses, e.g. X(3)
+    pic_pattern_repeats = re.compile(r'(.)\((\d+)\)')
+    pic_pattern_float = re.compile(r'S?[9Z]*[.V][9Z]+')
+    pic_pattern_integer = re.compile(r'S?[9Z]+')
+
+
+# Parse the pic string
+def parse_pic_string(pic_str):
+    """Translate a PIC string into its type, length and precision.
+
+    Repeat counts such as ``X(3)`` are expanded first.  The expanded picture
+    is then classified as 'Float', 'Integer' or 'Char', prefixed with
+    'Signed ' when it starts with ``S``.
+
+    Returns a dict with the keys 'type', 'length' and 'precision'.  'length'
+    counts the picture characters left after the leading ``S`` and the
+    implied decimal point ``V`` are removed (an explicit ``.`` is counted).
+    'precision' is the number of picture characters behind the decimal
+    point, or 0 when there is none.
+    """
+    # Expand repeating chars one group at a time: X(3) -> XXX.  The text is
+    # spliced in directly so it is never interpreted as a regex template.
+    while True:
+        match = CobolPatterns.pic_pattern_repeats.search(pic_str)
+
+        if not match:
+            break
+
+        expanded_str = match.group(1) * int(match.group(2))
+
+        pic_str = pic_str[:match.start()] + expanded_str + pic_str[match.end():]
+
+    # Match to types
+    is_float = bool(CobolPatterns.pic_pattern_float.match(pic_str))
+
+    if is_float:
+        data_type = 'Float'
+    elif CobolPatterns.pic_pattern_integer.match(pic_str):
+        data_type = 'Integer'
+    else:
+        data_type = 'Char'
+
+    # Handle signed; startswith() also copes with an empty string
+    if pic_str.startswith("S"):
+        data_type = "Signed " + data_type
+        pic_str = pic_str[1:]
+
+    # Handle precision
+    decimal_pos = 0
+
+    if 'V' in pic_str:
+        decimal_pos = len(pic_str) - pic_str.index('V') - 1
+        # The implied decimal point is not part of the length
+        pic_str = pic_str.replace('V', '')
+    elif is_float and '.' in pic_str:
+        # An explicit decimal point stays part of the length
+        decimal_pos = len(pic_str) - pic_str.index('.') - 1
+
+    return {'type': data_type, 'length': len(pic_str), 'precision': decimal_pos}
+
+# Cleans the COBOL by converting the cobol information to single lines
+def clean_cobol(lines):
+    """Collapse fixed-format COBOL source into one string per statement.
+
+    Of every line only ``row[6:72]`` is used, with trailing whitespace
+    removed.  Empty lines and lines starting with ``*`` or ``/`` in that
+    slice are skipped.  Lines are collected until one ends with a period and
+    are then joined with single spaces; the first line of a statement keeps
+    its leading whitespace, continuation lines are stripped.
+
+    Returns the list of joined statements.  Lines left over without a
+    closing period are reported on stdout and not returned.
+    """
+    holder = []
+
+    output = []
+
+    for row in lines:
+        row = row[6:72].rstrip()
+
+        if not row or row[0] in ('*', '/'):
+            continue
+
+        holder.append(row.strip() if holder else row)
+
+        if row.endswith("."):
+            output.append(" ".join(holder))
+
+            holder = []
+
+    if holder:
+        # One pre-formatted string prints the same on Python 2 and Python 3
+        print("[WARNING] probably invalid COBOL - found unfinished line:  " + " ".join(holder))
+
+    return output
+
+"""
+Parses the COBOL
+ - converts the COBOL line into a dictionary containing the information
+ - parses the pic information into type, length, precision 
+ - handles redefines
+"""
+def parse_cobol(lines):
+    """Parse single-line COBOL rows into a list of dicts.
+
+    Every row matching ``CobolPatterns.row_pattern`` becomes a dict with the
+    keys 'level', 'name', 'redefines', 'occurs', 'indexed_by' and 'pic'.
+    'level' and 'occurs' are converted to int, and 'pic_info' (the result of
+    ``parse_pic_string``) is added when the row has a PIC clause.  Rows that
+    do not match are reported on stdout and skipped.
+
+    A row that REDEFINES an earlier item replaces it: the redefined item and
+    its subgroup are removed from the result and the row's 'redefines' is
+    reset to None.  When the redefined item cannot be found a message is
+    printed and the row is kept with 'redefines' still set.
+    """
+    output = []
+
+    intify = ("level", "occurs")
+
+    # All in 1 line now, let's parse
+    for row in lines:
+        stripped_row = row.strip()
+        match = CobolPatterns.row_pattern.match(stripped_row)
+
+        if not match:
+            print("Found unmatched row " + stripped_row)
+            continue
+
+        fields = match.groupdict()
+
+        for key in intify:
+            if fields[key] is not None:
+                fields[key] = int(fields[key])
+
+        if fields['pic'] is not None:
+            fields['pic_info'] = parse_pic_string(fields['pic'])
+
+        if fields['redefines'] is not None:
+            # Find the first item carrying the redefined name
+            redefined_index = next(
+                (index for index, item in enumerate(output)
+                 if item['name'] == fields['redefines']),
+                None)
+
+            if redefined_index is None:
+                print("Could not find the field to be redefined ({}) for row: {}".format(fields['redefines'], stripped_row))
+            else:
+                related_group = get_subgroup(output[redefined_index]['level'],
+                                             output[redefined_index + 1:])
+
+                # Drop the redefined item together with everything below it
+                del output[redefined_index:redefined_index + len(related_group) + 1]
+
+                fields['redefines'] = None
+
+        output.append(fields)
+
+    return output
+
+# Helper function
+# Collects the leading rows whose level is higher than parent_level and
+# stops at the first row whose level is equal to or lower than parent_level
+def get_subgroup(parent_level, lines):
+    children = []
+
+    for candidate in lines:
+        if not candidate["level"] > parent_level:
+            break
+
+        children.append(candidate)
+
+    return children
+
+# Flattens the parsed rows by passing them once through handle_occurs
+def denormalize_cobol(lines):
+    flattened = handle_occurs(lines, occurs=1)
+
+    return flattened
+
+# Helper function
+# Will go ahead and denormalize the COBOL
+# Because the OCCURS are removed the INDEXED BY will also be removed
+def handle_occurs(lines, occurs, level_diff=0, name_postfix=""):
+    """Return copies of `lines` repeated `occurs` times, OCCURS expanded.
+
+    lines        -- parsed rows (dicts with at least 'level', 'name',
+                    'occurs', 'pic' and 'indexed_by')
+    occurs       -- how often the rows have to be repeated
+    level_diff   -- value added to the level of every emitted row
+    name_postfix -- text appended to the name of every emitted row
+
+    When `occurs` is greater than 1 the repetition number is appended to the
+    names as '-<n>'.  A row with both OCCURS and PIC is emitted once per
+    occurrence with '-<j>' appended.  A row with OCCURS but no PIC is a
+    group: the row itself is dropped and its subgroup is expanded
+    recursively, shifted so that the first child takes over the group's
+    level.  The input rows are not modified; every emitted row is a copy
+    with 'indexed_by' set to None.
+    """
+    output = []
+
+    for i in range(1, occurs + 1):
+
+        skip_till = 0
+        # A single occurrence does not get a number appended to its names
+        new_name_postfix = name_postfix if occurs == 1 else name_postfix + '-' + str(i)
+
+        for index, row in enumerate(lines):
+            # Rows already emitted as part of a repeated group
+            if index < skip_till:
+                continue
+
+            new_row = row.copy()
+
+            new_row['level'] += level_diff
+
+            # Not needed when flattened
+            new_row['indexed_by'] = None
+
+            if row['occurs'] is None:
+                new_row['name'] = row['name'] + new_name_postfix
+
+                output.append(new_row)
+                continue
+
+            if row["pic"] is not None:
+                # If it has occurs and pic just repeat the same line multiple times
+                new_row['occurs'] = None
+
+                for j in range(1, row["occurs"] + 1):
+                    row_to_add = new_row.copy()
+
+                    row_to_add["name"] = row['name'] + new_name_postfix + '-' + str(j)
+
+                    output.append(row_to_add)
+
+                continue
+
+            # Get all the lines that have to occur
+            occur_lines = get_subgroup(row['level'], lines[index + 1:])
+
+            if not occur_lines:
+                # A repeating group without children has nothing to repeat
+                continue
+
+            # Calculate the new level difference that has to be applied
+            new_level_diff = level_diff + row['level'] - occur_lines[0]['level']
+
+            output += handle_occurs(occur_lines, row['occurs'], new_level_diff, new_name_postfix)
+
+            skip_till = index + len(occur_lines) + 1
+
+    return output
+
+"""
+Clean the names.
+
+Options to:
+ - strip prefixes on names
+ - enforce unique names
+ - make database safe names by converting - to _
+"""
+def clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False):
+    """Rewrite the 'name' of every row in place and return `lines`.
+
+    strip_prefix        -- drop everything up to and including the first '-'
+                           of 'name' and of a non-None 'indexed_by'
+    ensure_unique_names -- append '-2', '-3', ... to a name already handed out
+    make_database_safe  -- replace every '-' in the name with '_'
+
+    The options are applied per row in the order listed above.
+    """
+    taken = set()
+
+    for entry in lines:
+        if strip_prefix:
+            entry['name'] = entry['name'][entry['name'].find('-') + 1:]
+
+            indexed_by = entry['indexed_by']
+            if indexed_by is not None:
+                entry['indexed_by'] = indexed_by[indexed_by.find('-') + 1:]
+
+        if ensure_unique_names:
+            candidate = entry['name']
+            suffix = 1
+
+            # Count up until a name is found that is still free
+            while candidate in taken:
+                suffix += 1
+                candidate = entry['name'] + "-" + str(suffix)
+
+            taken.add(candidate)
+            entry['name'] = candidate
+
+        if make_database_safe:
+            entry['name'] = entry['name'].replace("-", "_")
+
+    return lines
+
+# Runs the raw copybook lines through every step: clean, parse, denormalize
+# and clean the names with all three name options switched on
+def process_cobol(lines):
+    cleaned = clean_cobol(lines)
+    parsed = parse_cobol(cleaned)
+    flattened = denormalize_cobol(parsed)
+
+    return clean_names(flattened, True, True, True)
+
+# Prints a Copybook compatible file
+def print_cobol(lines):
+    """Print parsed rows as fixed-format COBOL lines of 80 characters.
+
+    Every row is written as level, name and its REDEFINES, OCCURS,
+    INDEXED BY and PIC clauses, closed by a period.  The clauses follow the
+    order accepted by ``CobolPatterns.row_pattern`` so the output can be
+    parsed again.  Rows are indented by two spaces per open level and a row
+    that grows too long is continued on a following, further indented line.
+
+    Nothing is returned; the result is written to stdout.
+    """
+    default_padding = ' ' * 7
+    max_data_length = 66
+
+    output = []
+
+    # Stack of the levels that are currently open, used for the indentation
+    levels = [0]
+
+    for row in lines:
+        # Close the levels that are deeper than this row ...
+        while row['level'] < levels[-1]:
+            levels.pop()
+
+        # ... and open this row's level when it is not the current one
+        if row['level'] > levels[-1]:
+            levels.append(row['level'])
+
+        indent = (len(levels) - 1) * '  '
+
+        row_output = [indent, "{0:02d}  ".format(row['level']), row['name']]
+
+        # Only still set when the redefined field could not be resolved
+        if row.get('redefines') is not None:
+            row_output.append(" REDEFINES " + row['redefines'])
+
+        if row['occurs'] is not None:
+            row_output.append(" OCCURS {0:04d} TIMES".format(row['occurs']))
+
+        if row['indexed_by'] is not None:
+            row_output.append(" INDEXED BY " + row['indexed_by'])
+
+        if row['pic'] is not None:
+            row_output.append(" PIC " + row['pic'])
+
+        row_output.append(".")
+
+        outp = default_padding
+
+        for data in row_output:
+
+            if len(outp) + len(data) + 1 > max_data_length:
+                # Makes rows 80 chars
+                output.append(outp.ljust(80))
+
+                # Start the following line with an extra padding
+                outp = default_padding + indent + '    '
+
+            outp += data
+
+        output.append(outp.ljust(80))
+
+    print("\n".join(output))
+
+# Command line entry point: parses a copybook and prints the processed result
+if __name__ == '__main__':
+    import argparse
+    import os.path
+    import sys
+
+    parser = argparse.ArgumentParser(description="Parse COBOL Copybooks")
+    parser.add_argument("filename", help="The filename of the copybook.")
+    parser.add_argument("--skip-all-processing", help="Only processes the redefines.", default=False, action="store_true")
+    parser.add_argument("--skip-unique-names", help="Skips making all names unique.", default=False, action="store_true")
+    parser.add_argument("--skip-denormalize", help="Skips denormalizing the COBOL.", default=False, action="store_true")
+    parser.add_argument("--skip-strip-prefix", help="Skips stripping the prefix from the names.", default=False, action="store_true")
+
+    args = parser.parse_args()
+
+    if not os.path.isfile(args.filename):
+        print("Could not find " + args.filename)
+        # Report the failure to the caller through a non-zero exit status
+        sys.exit(1)
+
+    # The file is only needed for reading, so close it before processing
+    with open(args.filename, 'r') as f:
+        source_lines = f.readlines()
+
+    lines = parse_cobol(clean_cobol(source_lines))
+
+    if not args.skip_all_processing:
+        if not args.skip_denormalize:
+            lines = denormalize_cobol(lines)
+
+        if not args.skip_strip_prefix or not args.skip_unique_names:
+            lines = clean_names(lines, not args.skip_unique_names, not args.skip_strip_prefix)
+
+    print_cobol(lines)
diff --git a/example.cbl b/example.cbl
@@ -0,0 +1,9 @@
+00000 * Example COBOL Copybook file                                     AAAAAAAA
+00000  01  PAULUS-EXAMPLE-GROUP.                                        AAAAAAAA
+00000 * Group repeated 3 times                                          AAAAAAAA
+00000       05  PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES.                 AAAAAAAA
+00000           10  PAULUS-FIELD-1 PIC X(3).                            AAAAAAAA
+00000 * FIELD-2 redefines FIELD-1 as numeric                            AAAAAAAA
+00000           10  PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3).   AAAAAAAA
+00000 * Clause continued on the next line                               AAAAAAAA
+00000           10  PAULUS-FIELD-3 OCCURS 0002 TIMES                    AAAAAAAA
+00000                           PIC S9(3)V99.                           AAAAAAAA
+00000       05  PAULUS-THIS-IS-ANOTHER-GROUP.                           AAAAAAAA
+00000           10  PAULUS-YES PIC X(5).                                AAAAAAAA
diff --git a/example.py b/example.py
@@ -0,0 +1,5 @@
+import cobol
+
+# Print the name of every field of the processed example copybook
+with open("example.cbl", 'r') as f:
+    for row in cobol.process_cobol(f.readlines()):
+        print(row['name'])