COBOL copybook parser

This Python-based COBOL copybook parser command accepts stdin or a filename. It reads in the copybook text file and returns CSV to stdout in the following format:

  1. Field Name (concatenated names based on level hierarchy)
  2. Data Type (‘Integer’, ‘Float’, ‘Double’ or ‘BCD’)
  3. Field Length
  4. Implied decimal position (unit-based, i.e., 1st char = 1)

Handles repeating nested field groups (layered OCCURS) and variable length records (OCCURS … DEPENDING ON).  Looped fields are indented with ‘\t’ character(s); see the example output below.

Keeping the copybook parser loosely coupled from the data record parser allows for post-processing of the file layout, such as piping to Unix commands for text manipulation, manual edits such as adding ‘*’ in front of field names to identify the key fields used when importing data into a relational database, etc.

Copybook Parser Source Code

Note: This code has been superseded by the pyCOBOL package; updated code is available at Sourceforge.

import copy
import re
import sys
from optparse import *
from pic import PIC
"""
Accepts: COPYBOOK Filename or STDIN
Repeating blocks (OCCURS) & variable length records (DEPENDING ON)
Returns Comma Separated Values (CSV) via STDOUT:
1. Flattened data, concatenated fields names built from level hierarchy.
2. Returns Field Name, Data Type, Length & Implied Decimal Position
3. Pythonic indentation/syntax for repeating blocks 'OCCURS x TIMES:'
   where x is an integer indicating number of times to repeat or ...
   where 'x' is a string/field name that the number of times depends on.
"""

class Field:
    """Parse a single copybook line into its level, name, OCCURS and PIC parts.

    The line is tried against a fixed, ordered list of regular expressions
    (most specific first); the first one that matches supplies the values.
    """

    # Regex fragments: 2-digit level, name, OCCURS, DEPENDING ON, PIC, COMP
    _LEVEL_NAME = r'^(?P<level>\d{2})\s+(?P<name>[\S-]+)'
    _PIC = r'\s+PIC\s+(?P<pic>[-/+$,(0-9):A-Z*]+)'
    _OCCURS_DEPENDS = r'\s+OCCURS.*DEPENDING ON (?P<occurs>[\S-]+)'
    _OCCURS = r'\s+OCCURS (?P<occurs>\d+) TIMES'
    # The group must be named 'comp'; parse() reads fields['comp'].
    _COMP = r'\s+COMP-(?P<comp>[123])'

    # Compiled once for the whole class instead of once per instance.
    # NOTE: the trailing '.' is intentionally left unescaped (any character).
    # Lines such as "PIC S9(8) COMP." rely on it matching the blank that
    # follows the picture string, so escaping it would stop them matching.
    FIELD_PATTERNS = list(map(re.compile, (
        _LEVEL_NAME + _PIC + _COMP + '.',
        _LEVEL_NAME + _OCCURS_DEPENDS + '.',
        _LEVEL_NAME + _OCCURS + _PIC + _COMP + '.',
        _LEVEL_NAME + _OCCURS + _PIC + '.',
        _LEVEL_NAME + _OCCURS + '.',
        _LEVEL_NAME + _PIC + '.',
        _LEVEL_NAME + '.',
    )))

    # Column order; parse() always returns the first three of these.
    FIELDS = [
        'occurs', 'level', 'name',
        'type', 'length', 'decimal_pos',
        'pic', 'comp'
    ]

    def __init__(self):
        # PIC comes from the project's `pic` module; it decodes picture strings.
        self.picture = PIC()

    def parse(self, line):
        """Parse one ``(line_number, text)`` pair.

        ``text`` must already be stripped of leading whitespace, because the
        patterns are anchored on the 2-digit level number.

        Returns a list ``[occurs, level, name]``, extended with whatever
        ``self.picture.parse(pic, comp)`` returns when the line has a PIC
        clause (presumably type, length and decimal position -- confirm
        against the ``pic`` module).  ``occurs`` is an int for a fixed
        ``OCCURS n TIMES`` and the controlling field name (a string) for
        ``DEPENDING ON``.  A line that matches no pattern yields ``[1, 0, '']``.
        """
        fields = {'name': '', 'level': '0', 'occurs': '1', 'comp': '0'}
        line_num, line = line  # the line number is carried along but unused
        for pattern in self.FIELD_PATTERNS:
            match = pattern.match(line)
            if match:
                fields.update(match.groupdict())
                break
        if fields['occurs'].isdigit():
            fields['occurs'] = int(fields['occurs'])
        for key in ('level', 'comp'):
            fields[key] = int(fields[key])
        result = [fields[key] for key in self.FIELDS[:3]]
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if 'pic' in fields:
            result.extend(self.picture.parse(fields['pic'], fields['comp']))
        return result

class COPYBOOK:
    """Hold the parsed fields of a copybook and render the flattened layout.

    ``self.fields`` is a list of lists as produced by ``Field.parse``:
    ``[occurs, level, name, ...]`` with optional PIC-derived values after
    the first three entries.
    """

    def __init__(self):
        # Ensures str() and the processing steps work before parse_fields().
        self.fields = []

    def __str__(self):
        """Return one comma-separated line per field."""
        return '\n'.join(
            ','.join([str(value) for value in field]) for field in self.fields
        )

    def parse_fields(self, lines):
        """Parse raw copybook lines into ``self.fields``.

        Blank lines and lines whose first non-blank character is ``*``
        (comments) are dropped before parsing.
        """
        COMMENT_CHAR = '*'
        stripped = (raw.strip() for raw in lines)
        kept = [text for text in stripped if text and text[0] != COMMENT_CHAR]
        field = Field()
        # Field.parse expects (index, text) pairs.
        self.fields = [field.parse(item) for item in enumerate(kept)]

    def nested_field_names(self):
        """Replace each field name with its underscore-joined hierarchy path.

        Level 66 and 88 entries are not supported and are removed from
        ``self.fields``.  Unparsed lines (level 0) are kept but do not take
        part in the hierarchy.
        """
        LEVEL, NAME = 1, 2
        names = {}
        kept = []
        for field in self.fields:
            level = field[LEVEL]
            # Levels 66 & 88 not supported
            if level in (66, 88):
                continue
            if level == 0:
                # No pattern matched this line; leave the hierarchy untouched.
                kept.append(field)
                continue
            # Forget names from deeper levels of a previous group so they do
            # not leak into the path of a later, unrelated field.
            for stale in [lvl for lvl in names if lvl > level]:
                del names[stale]
            names[level] = field[NAME]
            field[NAME] = '_'.join([names[lvl] for lvl in sorted(names)])
            kept.append(field)
        self.fields = kept

    def occurs_n_times(self):
        """Print the layout, indenting fields inside OCCURS blocks with tabs.

        An ``OCCURS x TIMES:`` header is printed for every field whose occurs
        value is not 1; fields carrying PIC details are printed as
        ``name, <pic values...>`` at the current nesting depth.
        """
        OCCURS, LEVEL = 0, 1
        levels = [0]  # stack of open OCCURS levels; 0 is a sentinel
        for field in self.fields:
            level = field[LEVEL]
            if level == 0:
                # Unparsed line: must not disturb the nesting stack.
                continue
            # Close every OCCURS block this field is no longer inside of
            # (several may end at once), but never pop the sentinel.
            while len(levels) > 1 and level <= levels[-1]:
                levels.pop()
            if str(field[OCCURS]) != '1':
                # Single-argument print(...) is valid in Python 2 and 3.
                print('%sOCCURS %r TIMES:' % ('\t' * (len(levels) - 1), field[OCCURS]))
                levels.append(level)
            if len(field) > 3:
                print('%s%s' % ('\t' * (len(levels) - 1), ', '.join([str(i) for i in field[2:]])))

def load_copybook(lines):
    """Build a COPYBOOK from raw copybook *lines* and return it.

    Parses the lines, then runs the two post-processing steps in order:
    hierarchical name building, followed by the OCCURS-aware printout
    (which writes the layout to stdout).
    """
    parsed = COPYBOOK()
    parsed.parse_fields(lines)
    for stage in (parsed.nested_field_names, parsed.occurs_n_times):
        stage()
    return parsed

if __name__ == '__main__':
    # Command-line entry point: read a copybook from FILE (first positional
    # argument) or from STDIN, then parse it and print the flattened layout.
    ver = "COPYBOOK Parser 0.8\nWritten by Brian Peterson."
    usage = "usage: copybook.py [FILE]"
    parser = OptionParser(usage, version=ver)
    (options, args) = parser.parse_args()
    if args:
        # Read data from FILE; the context manager closes the handle promptly
        # instead of leaving it to the garbage collector.
        with open(args[0]) as source:
            lines = source.readlines()
    else:
        # No FILE arg found, read data from STDIN
        lines = sys.stdin.readlines()
    copybook = load_copybook(lines)

Sample Copybook File

Because of the sensitive nature of the actual data I am working with, I have chosen to substitute the following copybook example:

01  STUDENT.
  20  ID                               PIC 9(8).
 20  FIRST_NAME                        PIC X(32).
*                                      YYYYMMDD
  20  DATE_OF_BIRTH                    PIC S9(8) COMP.
  20  NUMOF_COURSES                    PIC 9(4) COMP.
  20  NUMOF_BOOKS                      PIC 9(4) COMP.
    25  COURSES OCCURS 8 TIMES DEPENDING ON NUMOF_COURSES.
      30  COURSE_ID                    PIC 9(8).
      30  COURSE_TITLE                 PIC X(48).
      30  INSTRUCTOR_ID                PIC 9(8).
      30  NUMOF_ASSIGNMENTS            PIC 9(4) COMP.
      30  ASSIGNMENTS OCCURS 4 TIMES DEPENDING ON NUMOF_ASSIGNMENTS.
        40  ASSIGNMENT_TYPE            PIC x(12).
        40  ASSIGNMENT_TITLE           PIC X(48).
*                                      YYYYMMDD
        40  DUE_DATE                   PIC S9(8) COMP.
        40  GRADE                      PIC S9V9.
  20 BOOKS.
    25  BOOK OCCURS 1 TO 5 TIMES DEPENDING ON NUMOF_BOOKS.
      30  ISBN                         PIC X(10).
*                                      YYYYMMDD
      30  RETURN_DATE                  PIC 9(8) COMP.

Sample Output

STUDENT_ID, Integer, 8, 0
STUDENT_FIRST_NAME, Char, 32, 0
STUDENT_DATE_OF_BIRTH, Integer, 9, 0
STUDENT_NUMOF_COURSES, Integer, 4, 0
STUDENT_NUMOF_BOOKS, Integer, 4, 0
OCCURS 'NUMOF_COURSES' TIMES:
        STUDENT_NUMOF_BOOKS_COURSES_COURSE_ID, Integer, 8, 0
        STUDENT_NUMOF_BOOKS_COURSES_COURSE_TITLE, Char, 48, 0
        STUDENT_NUMOF_BOOKS_COURSES_INSTRUCTOR_ID, Integer, 8, 0
        STUDENT_NUMOF_BOOKS_COURSES_NUMOF_ASSIGNMENTS, Integer, 4, 0
        OCCURS 'NUMOF_ASSIGNMENTS' TIMES:
                STUDENT_NUMOF_BOOKS_COURSES_ASSIGNMENTS_ASSIGNMENT_TITLE, Char, 48, 0
                STUDENT_NUMOF_BOOKS_COURSES_ASSIGNMENTS_DUE_DATE, Integer, 9, 0
                STUDENT_NUMOF_BOOKS_COURSES_ASSIGNMENTS_GRADE, Float, 4, 3
OCCURS 'NUMOF_BOOKS' TIMES:
        STUDENT_BOOKS_BOOK_ISBN, Char, 10, 0
        STUDENT_BOOKS_BOOK_RETURN_DATE, Integer, 8, 0

Tags: , , , ,

Leave a comment