Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
balloob committed Apr 9, 2013
0 parents commit 3e96427
Show file tree
Hide file tree
Showing 5 changed files with 433 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
320 changes: 320 additions & 0 deletions cobol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,320 @@
import re

class CobolPatterns:
    """Regular expressions used to parse copybook rows and PIC strings.

    ``row_pattern`` matches one complete, single-line data description
    entry: a two-digit level number and a name, optionally followed (in
    this order) by REDEFINES, OCCURS, INDEXED BY and PIC clauses, and
    terminated by a period. The named groups are ``level``, ``name``,
    ``redefines``, ``occurs``, ``indexed_by`` and ``pic``; groups of
    clauses that are absent are ``None``.
    """

    # Wraps a clause pattern so that the whole clause becomes optional.
    opt_pattern_format = "({})?"

    row_pattern_base = r'^(?P<level>\d{2})\s+(?P<name>\S+)'
    # Keywords may be separated by any run of whitespace and the word
    # TIMES is optional, so "OCCURS 3." and "OCCURS  3  TIMES." both match.
    row_pattern_occurs = r'\s+OCCURS\s+(?P<occurs>\d+)(?:\s+TIMES)?'
    row_pattern_indexed_by = r'\s+INDEXED\s+BY\s+(?P<indexed_by>\S+)'
    row_pattern_redefines = r'\s+REDEFINES\s+(?P<redefines>\S+)'
    row_pattern_pic = r'\s+PIC\s+(?P<pic>\S+)'
    row_pattern_end = r'\.$'

    row_pattern = re.compile(
        row_pattern_base +
        opt_pattern_format.format(row_pattern_redefines) +
        opt_pattern_format.format(row_pattern_occurs) +
        opt_pattern_format.format(row_pattern_indexed_by) +
        opt_pattern_format.format(row_pattern_pic) +
        row_pattern_end
    )

    # A single character followed by a repeat count, e.g. "9(3)".
    pic_pattern_repeats = re.compile(r'(.)\((\d+)\)')
    # Optional sign, digits, a "." or "V" decimal marker, then digits.
    pic_pattern_float = re.compile(r'S?[9Z]*[.V][9Z]+')
    # Optional sign followed by digits only.
    pic_pattern_integer = re.compile(r'S?[9Z]+')


def parse_pic_string(pic_str):
    """Parse a COBOL PIC string into its type, length and precision.

    Repeat groups such as ``9(3)`` are expanded to ``999`` first. The
    expanded string is then classified as 'Float', 'Integer' or 'Char'
    with the patterns in ``CobolPatterns``; a leading ``S`` prefixes the
    type with 'Signed ' and is not counted in the length. When a ``V`` is
    present the number of characters after it is the precision, and the
    ``V`` itself is not counted in the length.

    Returns a dict with the keys 'type', 'length' and 'precision'.
    """
    # Expand repeating chars, one group at a time from left to right
    while True:
        match = CobolPatterns.pic_pattern_repeats.search(pic_str)

        if not match:
            break

        expanded_str = match.group(1) * int(match.group(2))

        # Splice the expansion in directly instead of handing it to re.sub
        # as a replacement template, where a backslash would be treated
        # as the start of an escape sequence.
        pic_str = pic_str[:match.start()] + expanded_str + pic_str[match.end():]

    # Match to types
    if CobolPatterns.pic_pattern_float.match(pic_str):
        data_type = 'Float'
    elif CobolPatterns.pic_pattern_integer.match(pic_str):
        data_type = 'Integer'
    else:
        data_type = 'Char'

    # Handle signed; startswith is safe when the string is empty
    if pic_str.startswith("S"):
        data_type = "Signed " + data_type
        pic_str = pic_str[1:]

    # Handle precision
    decimal_pos = 0

    if 'V' in pic_str:
        decimal_pos = len(pic_str[pic_str.index('V') + 1:])
        pic_str = pic_str.replace('V', '')

    return {'type': data_type, 'length': len(pic_str), 'precision': decimal_pos}

def clean_cobol(lines):
    """Clean the COBOL by converting the COBOL information to single lines.

    Only the characters at index 6 up to (not including) 72 of every line
    are kept, with trailing whitespace removed. Lines that are empty after
    that, or that start with '*' or '/' at that position, are skipped. The
    remaining lines are collected until one ends with a period and are then
    joined with single spaces; the first collected line keeps its leading
    whitespace, the following ones are stripped.

    Returns the list of joined lines. Lines left over without a closing
    period are reported with a warning on standard output and dropped.
    """
    holder = []
    output = []

    for row in lines:
        row = row[6:72].rstrip()

        if not row or row[0] in ('*', '/'):
            continue

        holder.append(row.strip() if holder else row)

        if row.endswith("."):
            output.append(" ".join(holder))
            holder = []

    if holder:
        # Single-argument print works as a statement on Python 2 and as a
        # function call on Python 3; the text is the same as before.
        print("[WARNING] probably invalid COBOL - found unfinished line:  " + " ".join(holder))

    return output

"""
Parses the COBOL
- converts the COBOL line into a dictionary containing the information
- parses the pic information into type, length, precision
- handles redefines
"""
def parse_cobol(lines):
    """Parse single-line COBOL rows into a list of dictionaries.

    Every row is matched against ``CobolPatterns.row_pattern``. Rows that do
    not match are reported on standard output and skipped. For matching rows
    the named groups become the dictionary; 'level' and 'occurs' are
    converted to int (or stay None), and when a PIC clause is present its
    parsed form is stored under 'pic_info'.

    When a row redefines another field, the first earlier row with that name
    is removed from the result together with all rows below it that have a
    higher level, and the row's 'redefines' value is reset to None. If no such
    row is found a message is printed and 'redefines' is left as it was.
    """
    output = []

    intify = ["level", "occurs"]

    # All in 1 line now, let's parse
    for row in lines:
        stripped = row.strip()
        match = CobolPatterns.row_pattern.match(stripped)

        if not match:
            print("Found unmatched row " + stripped)
            continue

        match = match.groupdict()

        for i in intify:
            match[i] = int(match[i]) if match[i] is not None else None

        if match['pic'] is not None:
            match['pic_info'] = parse_pic_string(match['pic'])

        if match['redefines'] is not None:
            # Find the first item that is being redefined, if any.
            redefined_index = next(
                (index for index, item in enumerate(output)
                 if item['name'] == match['redefines']),
                None
            )

            if redefined_index is None:
                print("Could not find the field to be redefined ({}) for row: {}".format(match['redefines'], stripped))
            else:
                redefined_item = output[redefined_index]

                related_group = get_subgroup(redefined_item['level'], output[redefined_index + 1:])

                # Drop the redefined item and everything that belongs to it
                output = output[:redefined_index] + output[redefined_index + len(related_group) + 1:]

                match['redefines'] = None

        output.append(match)

    return output

def get_subgroup(parent_level, lines):
    """Return the leading rows of ``lines`` that sit below ``parent_level``.

    Rows are taken for as long as their 'level' is higher than
    ``parent_level``; the first row with an equal or lower level ends the
    subgroup. The result is a new list holding the original row objects.
    """
    subgroup = []

    for candidate in lines:
        if not candidate["level"] > parent_level:
            break

        subgroup.append(candidate)

    return subgroup

def denormalize_cobol(lines):
    """Flatten the parsed rows by running ``handle_occurs`` once over them."""
    single_pass = 1
    return handle_occurs(lines, single_pass)

def handle_occurs(lines, occurs, level_diff=0, name_postfix=""):
    """Denormalize the COBOL by writing out every OCCURS repetition.

    ``lines`` is emitted ``occurs`` times. When ``occurs`` is larger than 1
    the repetition number is appended to ``name_postfix`` as ``-<i>`` and
    that postfix is added to every emitted name. ``level_diff`` is added to
    the level of every emitted row.

    - A row without OCCURS is copied as is (with the postfix on its name).
    - A row with OCCURS and a PIC is repeated, each copy named
      ``<name><postfix>-<j>`` and with 'occurs' reset to None.
    - A row with OCCURS but no PIC is a group: the row itself is dropped and
      the rows below it are emitted recursively, moved up to the level of
      the group row. A group without any rows below it emits nothing.

    Because the OCCURS are removed, 'indexed_by' is set to None on every
    emitted row. The input rows are not modified.
    """
    output = []

    for i in range(1, occurs + 1):

        skip_till = 0
        # First time occurs is just 1, we don't want to add -1 after *every* field
        new_name_postfix = name_postfix if occurs == 1 else name_postfix + '-' + str(i)

        for index, row in enumerate(lines):
            # Rows that were already emitted as part of a group are skipped
            if index < skip_till:
                continue

            new_row = row.copy()

            new_row['level'] += level_diff

            # Not needed when flattened
            new_row['indexed_by'] = None

            if row['occurs'] is None:
                new_row['name'] = row['name'] + new_name_postfix

                output.append(new_row)

            elif row['pic'] is not None:
                # If it has occurs and pic just repeat the same line multiple times
                new_row['occurs'] = None

                for j in range(1, row['occurs'] + 1):
                    row_to_add = new_row.copy()
                    row_to_add['name'] = row['name'] + new_name_postfix + '-' + str(j)

                    output.append(row_to_add)

            else:
                # Get all the lines that have to occur
                occur_lines = get_subgroup(row['level'], lines[index + 1:])

                # A group without subordinate rows has nothing to repeat;
                # indexing occur_lines[0] below would raise IndexError.
                if not occur_lines:
                    continue

                # Calculate the new level difference that has to be applied
                new_level_diff = level_diff + row['level'] - occur_lines[0]['level']

                output += handle_occurs(occur_lines, row['occurs'], new_level_diff, new_name_postfix)

                skip_till = index + len(occur_lines) + 1

    return output

"""
Clean the names.
Options to:
- strip prefixes on names
- enforce unique names
- make database safe names by converting - to _
"""
def clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False):
    """Tidy up the 'name' of every row, in place, and return ``lines``.

    strip_prefix: remove everything up to and including the first '-' from
        'name' and, when it is set, from 'indexed_by'. Values without a '-'
        are left unchanged.
    ensure_unique_names: a name that was already handed out gets '-2', '-3',
        ... appended, using the first number that is still free.
    make_database_safe: replace every '-' in 'name' with '_'.
    """
    def _without_prefix(value):
        # Keep the part after the first dash, or everything if there is none
        _head, dash, tail = value.partition('-')
        return tail if dash else value

    taken = set()

    for entry in lines:
        if strip_prefix:
            entry['name'] = _without_prefix(entry['name'])

            if entry['indexed_by'] is not None:
                entry['indexed_by'] = _without_prefix(entry['indexed_by'])

        if ensure_unique_names:
            base_name = entry['name']
            candidate = base_name
            counter = 1

            while candidate in taken:
                counter += 1
                candidate = base_name + "-" + str(counter)

            taken.add(candidate)
            entry['name'] = candidate

        if make_database_safe:
            entry['name'] = entry['name'].replace("-", "_")

    return lines

def process_cobol(lines):
    """Run the whole pipeline on raw copybook lines.

    The lines are cleaned, parsed and denormalized, after which the names
    are stripped of their prefix, made unique and made database safe.
    """
    single_lines = clean_cobol(lines)
    parsed_rows = parse_cobol(single_lines)
    flattened_rows = denormalize_cobol(parsed_rows)

    return clean_names(
        flattened_rows,
        ensure_unique_names=True,
        strip_prefix=True,
        make_database_safe=True
    )

def print_cobol(lines):
    """Print the rows as a Copybook compatible file on standard output.

    Every row is written as level number, name and, when set, the OCCURS,
    INDEXED BY and PIC clauses, followed by a period. OCCURS is written
    before INDEXED BY, which is the order ``CobolPatterns.row_pattern``
    expects when parsing. Rows are indented by nesting depth, every output
    line is padded with spaces to 80 characters, and a row that would grow
    past the data length limit is continued on a following line.
    """
    output = []

    default_padding = ' ' * 7
    max_data_length = 66

    # Stack of the levels we are currently nested in; its depth sets the indent
    levels = [0]

    for row in lines:
        row_output = []

        if row['level'] > levels[-1]:
            levels.append(row['level'])
        else:
            while row['level'] < levels[-1]:
                levels.pop()

        row_output.append((len(levels) - 1) * ' ')
        row_output.append("{0:02d} ".format(row['level']))
        row_output.append(row['name'])

        if row['occurs'] is not None:
            row_output.append(" OCCURS {0:04d} TIMES".format(row['occurs']))

        if row['indexed_by'] is not None:
            row_output.append(" INDEXED BY " + row['indexed_by'])

        if row['pic'] is not None:
            row_output.append(" PIC " + row['pic'])

        row_output.append(".")

        outp = default_padding

        for data in row_output:

            if len(outp) + len(data) + 1 > max_data_length:
                # Makes rows 80 chars
                output.append(outp.ljust(80))

                # Start the following line with an extra padding
                outp = default_padding + (len(levels) - 1) * ' ' + ' '

            outp += data

        output.append(outp.ljust(80))

    # Single-argument print is valid on both Python 2 and Python 3
    print("\n".join(output))

# Command line entry point: parse a copybook, apply the processing steps that
# were not switched off and print the result as a copybook.
if __name__ == '__main__':
    import argparse
    import os.path
    import sys

    parser = argparse.ArgumentParser(description="Parse COBOL Copybooks")
    parser.add_argument("filename", help="The filename of the copybook.")
    parser.add_argument("--skip-all-processing", help="Only processes the redefines.", default=False, action="store_true")
    parser.add_argument("--skip-unique-names", help="Skips making all names unique.", default=False, action="store_true")
    parser.add_argument("--skip-denormalize", help="Skips denormalizing the COBOL.", default=False, action="store_true")
    parser.add_argument("--skip-strip-prefix", help="Skips stripping the prefix from the names.", default=False, action="store_true")

    args = parser.parse_args()

    if not os.path.isfile(args.filename):
        print("Could not find " + args.filename)
        # Exit with a non-zero status so callers can detect the failure
        sys.exit(1)

    with open(args.filename, 'r') as f:
        lines = parse_cobol(clean_cobol(f.readlines()))

    if not args.skip_all_processing:
        if not args.skip_denormalize:
            lines = denormalize_cobol(lines)

        # clean_names is only needed when at least one of its steps is wanted
        if not args.skip_strip_prefix or not args.skip_unique_names:
            lines = clean_names(lines, not args.skip_unique_names, not args.skip_strip_prefix)

    print_cobol(lines)
9 changes: 9 additions & 0 deletions example.cbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
00000 * Example COBOL Copybook file AAAAAAAA
00000 * PAULUS-ANOTHER-GROUP occurs 3 times. Inside it PAULUS-FIELD-2 AAAAAAAA
00000 * redefines PAULUS-FIELD-1 and PAULUS-FIELD-3 occurs 2 times. AAAAAAAA
00000 * The PAULUS-FIELD-3 entry is split over two lines. AAAAAAAA
00000 01 PAULUS-EXAMPLE-GROUP. AAAAAAAA
00000 05 PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES. AAAAAAAA
00000 10 PAULUS-FIELD-1 PIC X(3). AAAAAAAA
00000 10 PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3). AAAAAAAA
00000 10 PAULUS-FIELD-3 OCCURS 0002 TIMES AAAAAAAA
00000 PIC S9(3)V99. AAAAAAAA
00000 05 PAULUS-THIS-IS-ANOTHER-GROUP. AAAAAAAA
00000 10 PAULUS-YES PIC X(5). AAAAAAAA
5 changes: 5 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import cobol

# Print the cleaned-up name of every field in the example copybook
with open("example.cbl", 'r') as f:
    for row in cobol.process_cobol(f.readlines()):
        # Single-argument print is valid on both Python 2 and Python 3
        print(row['name'])
Loading

0 comments on commit 3e96427

Please sign in to comment.