forked from royopa/python-cobol
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3e96427
Showing
5 changed files
with
433 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,320 @@ | ||
import re | ||
|
||
class CobolPatterns:
    """Regular expressions for copybook rows and PIC strings."""

    # Template that wraps a clause pattern in an optional group.
    opt_pattern_format = "({})?"

    # Pieces of a single-line data description entry, in the order the
    # combined row pattern expects them.
    row_pattern_base = r'^(?P<level>\d{2})\s+(?P<name>\S+)'
    row_pattern_redefines = r"\s+REDEFINES\s(?P<redefines>\S+)"
    row_pattern_occurs = r'\s+OCCURS (?P<occurs>\d+) TIMES'
    row_pattern_indexed_by = r"\s+INDEXED BY\s(?P<indexed_by>\S+)"
    row_pattern_pic = r'\s+PIC\s+(?P<pic>\S+)'
    row_pattern_end = r'\.$'

    # Level and name are mandatory; every clause after them is optional and
    # the entry must end with a period.
    row_pattern = re.compile("".join((
        row_pattern_base,
        opt_pattern_format.format(row_pattern_redefines),
        opt_pattern_format.format(row_pattern_occurs),
        opt_pattern_format.format(row_pattern_indexed_by),
        opt_pattern_format.format(row_pattern_pic),
        row_pattern_end,
    )))

    # "X(3)" style repetition: group 1 is the character, group 2 the count.
    pic_pattern_repeats = re.compile(r'(.)\((\d+)\)')
    # Numeric picture containing an implied (V) or explicit (.) decimal point.
    pic_pattern_float = re.compile(r'S?[9Z]*[.V][9Z]+')
    # Numeric picture without a decimal point.
    pic_pattern_integer = re.compile(r'S?[9Z]+')
|
||
|
||
def parse_pic_string(pic_str):
    """Describe a PIC string as a type, a length and a precision.

    Repetition factors are expanded first ("X(3)" becomes "XXX"), then the
    expanded picture is classified with the CobolPatterns pic patterns.

    Returns a dict with the keys:
      - 'type': 'Float', 'Integer' or 'Char', prefixed with 'Signed ' when
        the picture starts with "S"
      - 'length': number of picture characters left after removing the
        sign and the implied decimal point "V"
      - 'precision': number of picture characters after the decimal point
        ("V" or, for Float pictures, an explicit "."), 0 when there is none

    Example: "S9(3)V99" gives
    {'type': 'Signed Float', 'length': 5, 'precision': 2}.
    """
    # Expand repeating chars. The text is spliced in by slicing rather than
    # re.sub() so it is never interpreted as a replacement template (a
    # repeated backslash would otherwise be read as an escape).
    while True:
        match = CobolPatterns.pic_pattern_repeats.search(pic_str)

        if not match:
            break

        expanded_str = match.group(1) * int(match.group(2))
        pic_str = pic_str[:match.start()] + expanded_str + pic_str[match.end():]

    # Match to types
    if CobolPatterns.pic_pattern_float.match(pic_str):
        data_type = 'Float'
    elif CobolPatterns.pic_pattern_integer.match(pic_str):
        data_type = 'Integer'
    else:
        data_type = 'Char'

    is_float = data_type == 'Float'

    # Handle signed. startswith() also copes with an empty picture, where
    # indexing the first character would raise IndexError.
    if pic_str.startswith("S"):
        data_type = "Signed " + data_type
        pic_str = pic_str[1:]

    # Handle precision
    decimal_pos = 0

    if 'V' in pic_str:
        # "V" is an implied decimal point: it is not a character position,
        # so it is dropped before the length is measured.
        decimal_pos = len(pic_str) - pic_str.index('V') - 1
        pic_str = pic_str.replace('V', '')
    elif is_float and '.' in pic_str:
        # An explicit "." stays in the picture and therefore in the length,
        # but the characters after it still make up the precision.
        decimal_pos = len(pic_str) - pic_str.index('.') - 1

    return {'type': data_type, 'length': len(pic_str), 'precision': decimal_pos}
|
||
def clean_cobol(lines):
    """Convert raw copybook lines into one string per COBOL statement.

    For every input line only the slice [6:72] is kept, with trailing
    whitespace removed. Blank lines and lines whose first kept character is
    '*' or '/' are skipped. The remaining pieces are collected until one
    ends with a period and are then joined with single spaces.

    The first piece of a statement keeps its leading whitespace; the
    following pieces are stripped. If the input ends in the middle of a
    statement a warning is printed and the unfinished pieces are dropped.

    Returns the list of joined statements.
    """
    holder = []
    output = []

    for row in lines:
        row = row[6:72].rstrip()

        if row == "" or row[0] in ('*', '/'):
            continue

        holder.append(row.strip() if holder else row)

        if row[-1] == ".":
            output.append(" ".join(holder))
            holder = []

    if holder:
        # Single-argument print() emits the same text on Python 2 and 3;
        # the two spaces after the colon match the original message.
        print("[WARNING] probably invalid COBOL - found unfinished line:  {}".format(" ".join(holder)))

    return output
|
||
""" | ||
Parses the COBOL | ||
- converts the COBOL line into a dictionary containing the information | ||
- parses the pic information into type, length, precision | ||
- handles redefines | ||
""" | ||
def parse_cobol(lines):
    """Parse single-line COBOL statements into a list of dicts.

    Each row is stripped and matched against CobolPatterns.row_pattern; rows
    that do not match are reported on stdout and skipped. A parsed row is
    the pattern's groupdict with:
      - 'level' and 'occurs' converted to int when present
      - a 'pic_info' entry from parse_pic_string() when the row has a PIC
      - REDEFINES applied: the first earlier row with the redefined name is
        removed together with its subordinate rows, and 'redefines' is
        reset to None. When no such row exists a message is printed and
        'redefines' keeps the unresolved name.

    Returns the list of parsed rows in input order.
    """
    output = []

    intify = ["level", "occurs"]

    # All in 1 line now, let's parse
    for row in lines:
        stripped = row.strip()
        match = CobolPatterns.row_pattern.match(stripped)

        if not match:
            print("Found unmatched row {}".format(stripped))
            continue

        match = match.groupdict()

        for key in intify:
            if match[key] is not None:
                match[key] = int(match[key])

        if match['pic'] is not None:
            match['pic_info'] = parse_pic_string(match['pic'])

        if match['redefines'] is not None:
            # Find the first item that is being redefined.
            redefined_index = next(
                (index for index, item in enumerate(output)
                 if item['name'] == match['redefines']),
                None,
            )

            if redefined_index is None:
                print("Could not find the field to be redefined ({}) for row: {}".format(
                    match['redefines'], stripped))
            else:
                related_group = get_subgroup(
                    output[redefined_index]['level'],
                    output[redefined_index + 1:],
                )

                # Drop the redefined item and everything nested below it;
                # the redefining row appended below takes its place.
                del output[redefined_index:redefined_index + len(related_group) + 1]

                match['redefines'] = None

        output.append(match)

    return output
|
||
def get_subgroup(parent_level, lines):
    """Return the leading rows of `lines` that are nested below parent_level.

    Rows are collected in order while their 'level' is greater than
    parent_level; the first row at an equal or lower level ends the group.
    """
    children = []

    for candidate in lines:
        if candidate["level"] <= parent_level:
            break

        children.append(candidate)

    return children
|
||
def denormalize_cobol(lines):
    """Flatten all OCCURS clauses in the parsed rows via handle_occurs()."""
    # The top level itself occurs exactly once.
    flattened = handle_occurs(lines, 1)
    return flattened
|
||
def handle_occurs(lines, occurs, level_diff=0, name_postfix=""):
    """Denormalize parsed rows by writing out every OCCURS repetition.

    `lines` is emitted `occurs` times. Each emitted row is a copy of the
    source row with `level_diff` added to its level and 'indexed_by' reset
    to None, because the index is meaningless once OCCURS is removed.

    Names get `name_postfix`, followed by "-<n>" for the current repetition
    when `occurs` is not 1. Rows that carry their own OCCURS are handled as:
      - with a PIC: the row is repeated, names suffixed "-1" .. "-<occurs>"
      - without a PIC (a group): the group row is dropped and its
        subordinate rows are flattened recursively, shifted to the group's
        level

    Returns the new list of rows; the input rows are not modified.
    """
    flattened = []

    for repetition in range(1, occurs + 1):
        # A single occurrence needs no counter in the name.
        if occurs == 1:
            suffix = name_postfix
        else:
            suffix = name_postfix + '-' + str(repetition)

        resume_at = 0

        for position, source in enumerate(lines):
            # Rows already emitted as part of a nested group are skipped.
            if position < resume_at:
                continue

            template = source.copy()
            template['level'] += level_diff
            template['indexed_by'] = None

            repeat = source['occurs']

            if repeat is None:
                template['name'] = source['name'] + suffix
                flattened.append(template)
                continue

            if source["pic"] is not None:
                # Elementary item with OCCURS: emit the same row repeatedly.
                template['occurs'] = None

                for counter in range(1, repeat + 1):
                    clone = template.copy()
                    clone["name"] = source['name'] + suffix + '-' + str(counter)
                    flattened.append(clone)

                continue

            # Group with OCCURS: collect everything nested below it.
            children = get_subgroup(source['level'], lines[position + 1:])

            # Shift the children so the first one lands on the group's level.
            child_level_diff = level_diff + source['level'] - children[0]['level']

            flattened += handle_occurs(children, repeat, child_level_diff, suffix)

            resume_at = position + len(children) + 1

    return flattened
|
||
""" | ||
Clean the names. | ||
Options to: | ||
- strip prefixes on names | ||
- enforce unique names | ||
- make database safe names by converting - to _ | ||
""" | ||
def clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False):
    """Tidy the 'name' of every row in place and return the same list.

    Each option is applied per row, in this order:
      - strip_prefix: drop everything up to and including the first '-'
        from 'name' and, when set, from 'indexed_by'
      - ensure_unique_names: append "-2", "-3", ... to a name that an
        earlier row already uses
      - make_database_safe: replace every '-' in the name with '_'
    """
    seen = set()

    for entry in lines:
        if strip_prefix:
            full_name = entry['name']
            entry['name'] = full_name[full_name.find('-') + 1:]

            index_name = entry['indexed_by']
            if index_name is not None:
                entry['indexed_by'] = index_name[index_name.find('-') + 1:]

        if ensure_unique_names:
            # Try the bare name first, then numbered variants from 2 upwards.
            candidate = entry['name']
            counter = 1
            while candidate in seen:
                counter += 1
                candidate = entry['name'] + "-" + str(counter)

            seen.add(candidate)
            entry['name'] = candidate

        if make_database_safe:
            entry['name'] = entry['name'].replace("-", "_")

    return lines
|
||
def process_cobol(lines):
    """Run the whole pipeline on raw copybook lines.

    The lines are cleaned, parsed and denormalized. Names are then prefix
    stripped, made unique and made database safe.
    """
    statements = clean_cobol(lines)
    parsed_rows = parse_cobol(statements)
    flat_rows = denormalize_cobol(parsed_rows)
    return clean_names(flat_rows, True, True, True)
|
||
def print_cobol(lines):
    """Print parsed rows to stdout as a copybook-style listing.

    Every row is written as level, name and its clauses, terminated by a
    period. The clauses are emitted in the order REDEFINES, OCCURS,
    INDEXED BY, PIC, which is the order CobolPatterns.row_pattern accepts,
    so printed rows can be parsed again. Rows are indented by nesting
    depth, and each output line is padded to 80 characters. A clause that
    would push a line past the data width is moved to a continuation line.

    `lines` is a list of row dicts with the keys 'level', 'name',
    'indexed_by', 'occurs' and 'pic'; 'redefines' is optional.
    """
    output = []

    default_padding = ' ' * 7
    max_data_length = 66

    # Stack of the levels currently open; its depth drives the indentation.
    levels = [0]

    for row in lines:
        if row['level'] > levels[-1]:
            levels.append(row['level'])
        else:
            while row['level'] < levels[-1]:
                levels.pop()

        indent = (len(levels) - 1) * ' '

        row_output = [indent, "{0:02d} ".format(row['level']), row['name']]

        # Only still set when the redefined field could not be resolved.
        if row.get('redefines') is not None:
            row_output.append(" REDEFINES " + row['redefines'])

        if row['occurs'] is not None:
            row_output.append(" OCCURS {0:04d} TIMES".format(row['occurs']))

        # INDEXED BY is a phrase of the OCCURS clause, so it follows it.
        if row['indexed_by'] is not None:
            row_output.append(" INDEXED BY " + row['indexed_by'])

        if row['pic'] is not None:
            row_output.append(" PIC " + row['pic'])

        row_output.append(".")

        outp = default_padding

        for data in row_output:
            if len(outp) + len(data) + 1 > max_data_length:
                # Makes rows 80 chars
                output.append(outp.ljust(80))

                # Start the following line with an extra padding
                outp = default_padding + indent + ' '

            outp += data

        output.append(outp.ljust(80))

    print("\n".join(output))
|
||
# Command line entry point: parse a copybook file and print the result.
if __name__ == '__main__':
    import argparse
    import os.path
    import sys

    parser = argparse.ArgumentParser(description="Parse COBOL Copybooks")
    parser.add_argument("filename", help="The filename of the copybook.")
    parser.add_argument("--skip-all-processing", help="Only processes the redefines.", default=False, action="store_true")
    parser.add_argument("--skip-unique-names", help="Skips making all names unique.", default=False, action="store_true")
    parser.add_argument("--skip-denormalize", help="Skips denormalizing the COBOL.", default=False, action="store_true")
    parser.add_argument("--skip-strip-prefix", help="Skips stripping the prefix from the names.", default=False, action="store_true")

    args = parser.parse_args()

    if not os.path.isfile(args.filename):
        print("Could not find {}".format(args.filename))
        # sys.exit() does not depend on the site module, unlike the bare
        # exit() helper; the non-zero status reports the failure to callers.
        sys.exit(1)

    with open(args.filename, 'r') as f:
        lines = parse_cobol(clean_cobol(f.readlines()))

    if not args.skip_all_processing:
        if not args.skip_denormalize:
            lines = denormalize_cobol(lines)

        # clean_names() is only needed when at least one of its two
        # command line controlled options is still enabled.
        if not args.skip_strip_prefix or not args.skip_unique_names:
            lines = clean_names(lines, not args.skip_unique_names, not args.skip_strip_prefix)

    print_cobol(lines)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
00000 * Example COBOL Copybook file AAAAAAAA | ||
00000 01 PAULUS-EXAMPLE-GROUP. AAAAAAAA | ||
00000 05 PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES. AAAAAAAA | ||
00000 10 PAULUS-FIELD-1 PIC X(3). AAAAAAAA | ||
00000 10 PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3). AAAAAAAA | ||
00000 10 PAULUS-FIELD-3 OCCURS 0002 TIMES AAAAAAAA | ||
00000 PIC S9(3)V99. AAAAAAAA | ||
00000 05 PAULUS-THIS-IS-ANOTHER-GROUP. AAAAAAAA | ||
00000 10 PAULUS-YES PIC X(5). AAAAAAAA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import cobol | ||
|
||
# Print the processed field names of the example copybook, one per line.
with open("example.cbl", 'r') as f:
    for row in cobol.process_cobol(f.readlines()):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(row['name'])
Oops, something went wrong.