Source code for etude

from __future__ import print_function

import sys
import logging as log

from tqdm import tqdm

import glob
import os
## TODO - use warnings
import warnings

import re
import json

import args_and_configs
import scoring_metrics
import text_extraction

#############################################
## helper functions
#############################################

def count_ref_set( this_ns , this_dd , this_patterns , this_folder ,
                   args ,
                   file_prefix = '/' ,
                   file_suffix = '.xml' ,
                   set_type = None ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    """
    Count annotation occurrences in the test folder
    """
    type_counts = scoring_metrics.new_score_card( fuzzy_flags = [ 'counts' ] )
    file_list = set([os.path.basename(x) for x in glob.glob( this_folder +
                                                             file_prefix +
                                                             '*' +
                                                             file_suffix )])
    ##########################
    for this_filename in tqdm( sorted( file_list ) ,
                               file = args.progressbar_file ,
                               disable = args.progressbar_disabled ):
        try:
            this_full_path = os.path.join( this_folder , this_filename )
            this_om , this_ss = \
              text_extraction.extract_annotations( this_full_path ,
                                                   namespaces = this_ns ,
                                                   document_data = this_dd ,
                                                   patterns = this_patterns ,
                                                   out_file = None )
        except KeyError as e:
            log.error( 'KeyError exception in extract_annotations: {}'.format( e ) )
        except NameError as e:
            log.error( 'NameError exception in extract_annotations: {}'.format( e ) )
        except TypeError as e:
            log.error( 'TypeError exception in extract_annotations: {}'.format( e ) )
        except KeyboardInterrupt as e:
            log.error( 'KeyboardInterrupt in extract_annotations: {}'.format( e ) )
            sys.exit( 0 )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in extract_annotations: {}'.format( e ) )
        for this_start in this_ss:
            ## loop over all entries sharing the same start position
            ## and grab type and end position
            for this_entry in this_ss[ this_start ]:
                this_type = this_entry[ 'type' ]
                if( this_start == -1 ):
                    this_end = -1
                    sub_type = this_entry[ 'pivot_value' ]
                    ## TODO - don't force the pivot value into the attribute name
                    this_type = '{} = "{}"'.format( this_type ,
                                                    this_entry[ 'pivot_value' ] )
                else:
                    this_end = this_entry[ 'end_pos' ]
                    sub_type = None
                ##
                ##print( '{}\n'.format( this_type ) )
                scoring_metrics.update_score_card( 'Tally' , type_counts , 'counts' ,
                                                   this_filename ,
                                                   this_start , this_end ,
                                                   this_type ,
                                                   pivot_value = sub_type )
    ##
    if( args.csv_out and
        os.path.exists( args.csv_out ) ):
        os.remove( args.csv_out )
    ##
    try:
        scoring_metrics.print_counts_summary( type_counts ,
                                              sorted( file_list ) ,
                                              this_patterns ,
                                              args ,
                                              set_type = set_type )
    except AttributeError as e:
        log.error( 'AttributeError exception in print_counts_summary: {}'.format( e ) )
    except KeyError as e:
        log.error( 'KeyError exception in print_counts_summary: {}'.format( e ) )
    except NameError as e:
        log.error( 'NameError exception in print_counts_summary: {}'.format( e ) )
    except TypeError as e:
        log.error( 'TypeError exception in print_counts_summary: {}'.format( e ) )
    except:
        e = sys.exc_info()[0]
        log.error( 'Uncaught exception in print_counts_summary: {}'.format( e ) )
    #########
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )

def collect_files( reference_folder , test_folder ,
                   file_prefix , file_suffix ,
                   skip_missing_files_flag ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    file_mapping = {}
    match_count = 0
    ##
    reference_filenames = set([os.path.basename(x) for x in glob.glob( reference_folder +
                                                                       file_prefix +
                                                                       '*' +
                                                                       file_suffix[ 0 ] )])
    for reference_filename in sorted( reference_filenames ):
        if( len( file_suffix ) == 1 ):
            test_filename = reference_filename
        else:
            test_filename = re.sub( file_suffix[ 0 ] + '$' ,
                                    file_suffix[ 1 ] ,
                                    reference_filename )
        if( os.path.exists( os.path.join( test_folder ,
                                          test_filename ) ) ):
            match_count += 1
            file_mapping[ reference_filename ] = test_filename
        else:
            if( skip_missing_files_flag ):
                log.debug( "Skipping file because no test equivalent: {} -/-> {}".format( reference_filename ,
                                                                                          test_filename ) )
            else:
                file_mapping[ reference_filename ] = None
    ##
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )
    return( match_count , file_mapping )

def count_chars_profile( reference_ns , reference_dd , reference_folder ,
                         test_ns , test_dd , test_folder ,
                         args ,
                         file_prefix = '/' ,
                         file_suffix = '.xml' ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    """
    Extract a character profile for each document and corpus as a whole.
    """
    try:
        match_count , file_mapping = collect_files( reference_folder , test_folder ,
                                                    file_prefix , file_suffix ,
                                                    args.skip_missing_files )
    except:
        e = sys.exc_info()[0]
        log.error( 'Uncaught exception in collect_files: {}'.format( e ) )
    ##
    if( match_count == 0 ):
        ## Empty dictionaries evaluate to False so testing bool can tell us if
        ## any reference documents exist
        if( bool( file_mapping ) ):
            print( 'ERROR: No documents found in test directory: {}'.format( test_folder ) )
        else:
            print( 'ERROR: No documents found in reference directory: {}'.format( reference_folder ) )
        return( None )
    ##
    for reference_filename in tqdm( sorted( file_mapping.keys() ) ,
                                    file = args.progressbar_file ,
                                    disable = args.progressbar_disabled ):
        ##
        reference_out_file = generate_out_file( args.reference_out ,
                                                reference_filename )
        ##
        try:
            reference_chars = \
              text_extraction.extract_chars( os.path.join( reference_folder ,
                                                           reference_filename ) ,
                                             namespaces = reference_ns ,
                                             document_data = reference_dd )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in extract_chars: {}'.format( e ) )
        test_filename = file_mapping[ reference_filename ]
        if( test_filename == None ):
            test_chars = {}
        else:
            ##
            test_out_file = generate_out_file( args.test_out ,
                                               test_filename )
            ##
            try:
                test_full_path = os.path.join( test_folder ,
                                               test_filename )
                test_chars = \
                  text_extraction.extract_chars( test_full_path ,
                                                 namespaces = test_ns ,
                                                 document_data = test_dd )
            except:
                e = sys.exc_info()[0]
                log.error( 'Uncaught exception in extract_chars: {}'.format( e ) )
    ##
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )

def align_tokens( reference_folder , test_folder ,
                  args ,
                  file_prefix = '/' ,
                  file_suffix = '.xml' ):
    """
    Align reference and test documents by token for comparison
    """
    match_count , file_mapping = collect_files( reference_folder , test_folder ,
                                                file_prefix , file_suffix ,
                                                args.skip_missing_files )
    ##
    if( match_count == 0 ):
        ## Empty dictionaries evaluate to False so testing bool can tell us if
        ## any reference documents exist
        if( bool( file_mapping ) ):
            print( 'ERROR: No documents found in test directory: {}'.format( test_folder ) )
        else:
            print( 'ERROR: No documents found in reference directory: {}'.format( reference_folder ) )
        return( None )
    ##
    for reference_filename in tqdm( sorted( file_mapping.keys() ) ,
                                    file = args.progressbar_file ,
                                    disable = args.progressbar_disabled ):
        ##
        reference_out_file = generate_out_file( args.reference_out ,
                                                reference_filename )
        ##
        reference_dictionary = {}
        with open( os.path.join( reference_folder ,
                                 reference_filename ) , 'r' ) as fp:
            reference_dictionary = json.load( fp )
        text_extraction.align_tokens_on_whitespace( reference_dictionary ,
                                                    reference_out_file )
        test_filename = file_mapping[ reference_filename ]
        if( test_filename != None ):
            ##
            test_out_file = generate_out_file( args.test_out ,
                                               reference_filename )
            ##
            test_dictionary = {}
            with open( os.path.join( test_folder ,
                                     test_filename ) , 'r' ) as fp:
                test_dictionary = json.load( fp )
            text_extraction.align_tokens_on_whitespace( test_dictionary ,
                                                        test_out_file )

##
def get_file_mapping( reference_folder , test_folder ,
                      file_prefix , file_suffix ,
                      skip_missing_files_flag ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    """
    Create mapping between folders to see which files in each set
    need to be compared
    """
    try:
        match_count , file_mapping = collect_files( reference_folder , test_folder ,
                                                    file_prefix , file_suffix ,
                                                    skip_missing_files_flag )
    except:
        e = sys.exc_info()[0]
        log.error( 'Uncaught exception in collect_files: {}'.format( e ) )
    ##
    if( match_count == 0 ):
        ## Empty dictionaries evaluate to False so testing bool can tell us if
        ## any reference documents exist
        if( bool( file_mapping ) ):
            log.error( 'No documents found in test directory: {}'.format( test_folder ) )
        else:
            log.error( 'No documents found in reference directory: {}'.format( reference_folder ) )
        return( None )
    ##
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )
    return( file_mapping )

def create_output_folders( reference_out , test_out ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    """
    Create output folders for saving the results of our analysis
    """
    ##########################
    ## Reference folders
    if( reference_out != None and
        not os.path.exists( reference_out ) ):
        log.warning( 'Creating reference output folder because it does not exist: {}'.format( reference_out ) )
        try:
            os.makedirs( reference_out )
        except OSError as e:
            log.error( 'OSError caught while trying to create reference output folder: {}'.format( e ) )
        except IOError as e:
            log.error( 'IOError caught while trying to create reference output folder: {}'.format( e ) )
    ##########################
    ## Test (system output) folders
    if( test_out != None and
        not os.path.exists( test_out ) ):
        log.warning( 'Creating test output folder because it does not exist: {}'.format( test_out ) )
        try:
            os.makedirs( test_out )
        except OSError as e:
            log.error( 'OSError caught while trying to create test output folder: {}'.format( e ) )
        except IOError as e:
            log.error( 'IOError caught while trying to create test output folder: {}'.format( e ) )
    #########
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )

def generate_out_file( output_dir , input_filename ):
    """
    Generate a well-formed full file path for writing output stats
    """
    if( output_dir == None ):
        return( None )
    else:
        ## TODO - replace this and all path generation strings with
        ##        OS generic version
        return( os.path.join( output_dir ,
                              input_filename ) )

def score_ref_set( reference_ns , reference_dd , reference_patterns , reference_folder ,
                   test_ns , test_dd , test_patterns , test_folder ,
                   args ,
                   file_prefix = '/' ,
                   file_suffix = '.xml' ):
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    """
    Score the system output (test) folder against the reference folder.
    """
    score_card = scoring_metrics.new_score_card( fuzzy_flags = args.fuzzy_flags ,
                                                 normalization_engines = args.scorable_engines )
    ##
    confusion_matrix = {}
    ##########################
    file_mapping = get_file_mapping( reference_folder , test_folder ,
                                     file_prefix , file_suffix ,
                                     args.skip_missing_files )
    if( file_mapping == None ):
        ## There was a problem mapping files between directories so abort
        return( None )
    ##########################
    create_output_folders( args.reference_out , args.test_out )
    ##########################
    for reference_filename in tqdm( sorted( file_mapping.keys() ) ,
                                    file = args.progressbar_file ,
                                    disable = args.progressbar_disabled ):
        ##
        reference_out_file = generate_out_file( args.reference_out ,
                                                reference_filename )
        ##
        try:
            reference_full_path = os.path.join( reference_folder ,
                                                reference_filename )
            reference_om , reference_ss = \
              text_extraction.extract_annotations( reference_full_path ,
                                                   namespaces = reference_ns ,
                                                   document_data = reference_dd ,
                                                   patterns = reference_patterns ,
                                                   skip_chars = args.skip_chars ,
                                                   out_file = reference_out_file )
        except KeyError as e:
            log.error( 'KeyError exception in extract_annotations: {}'.format( e ) )
        except NameError as e:
            log.error( 'NameError exception in extract_annotations: {}'.format( e ) )
        except IndexError as e:
            log.error( 'IndexError exception in extract_annotations: {}'.format( e ) )
        except TypeError as e:
            log.error( 'TypeError exception in extract_annotations: {}'.format( e ) )
        except KeyboardInterrupt as e:
            log.error( 'KeyboardInterrupt in extract_annotations: {}'.format( e ) )
            sys.exit( 0 )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in extract_annotations: {}'.format( e ) )
        test_filename = file_mapping[ reference_filename ]
        if( test_filename == None ):
            test_om = {}
            test_ss = {}
        else:
            ##
            test_out_file = generate_out_file( args.test_out ,
                                               test_filename )
            ##
            test_full_path = os.path.join( test_folder ,
                                           test_filename )
            try:
                test_om , test_ss = \
                  text_extraction.extract_annotations( test_full_path ,
                                                       namespaces = test_ns ,
                                                       document_data = test_dd ,
                                                       patterns = test_patterns ,
                                                       skip_chars = args.skip_chars ,
                                                       out_file = test_out_file )
            except KeyError as e:
                log.error( 'KeyError exception in extract_annotations: {}'.format( e ) )
            except TypeError as e:
                log.error( 'TypeError exception in extract_annotations: {}'.format( e ) )
            except KeyboardInterrupt as e:
                log.error( 'KeyboardInterrupt in extract_annotations: {}'.format( e ) )
                sys.exit( 0 )
            except:
                e = sys.exc_info()[0]
                log.error( 'Uncaught exception in extract_annotations: {}'.format( e ) )
        ##
        try:
            if( args.skip_chars == None ):
                ignore_chars = False
            else:
                ignore_chars = True
            ## Strictly enforce the constraint that 'start', 'end', and
            ## 'doc-property' match flags must be run individually on
            ## their own runs
            if( 'start' in args.fuzzy_flags ):
                scoring_metrics.evaluate_positions( reference_filename ,
                                                    confusion_matrix ,
                                                    score_card ,
                                                    reference_ss ,
                                                    test_ss ,
                                                    fuzzy_flag = 'start' ,
                                                    use_mapped_chars = ignore_chars ,
                                                    scorable_attributes = args.scorable_attributes ,
                                                    scorable_engines = args.scorable_engines ,
                                                    norm_synonyms = args.normalization_synonyms )
            elif( 'end' in args.fuzzy_flags ):
                scoring_metrics.evaluate_positions( reference_filename ,
                                                    confusion_matrix ,
                                                    score_card ,
                                                    reference_ss ,
                                                    test_ss ,
                                                    fuzzy_flag = 'end' ,
                                                    use_mapped_chars = ignore_chars ,
                                                    scorable_attributes = args.scorable_attributes ,
                                                    scorable_engines = args.scorable_engines ,
                                                    norm_synonyms = args.normalization_synonyms )
            elif( 'doc-property' in args.fuzzy_flags ):
                scoring_metrics.evaluate_doc_properties( reference_filename ,
                                                         confusion_matrix ,
                                                         score_card ,
                                                         reference_ss ,
                                                         test_ss ,
                                                         patterns = reference_patterns ,
                                                         fuzzy_flag = 'doc-property' ,
                                                         scorable_attributes = args.scorable_attributes ,
                                                         scorable_engines = args.scorable_engines ,
                                                         norm_synonyms = args.normalization_synonyms )
            else:
                for fuzzy_flag in args.fuzzy_flags:
                    scoring_metrics.evaluate_positions( reference_filename ,
                                                        confusion_matrix ,
                                                        score_card ,
                                                        reference_ss ,
                                                        test_ss ,
                                                        fuzzy_flag = fuzzy_flag ,
                                                        use_mapped_chars = ignore_chars ,
                                                        scorable_attributes = args.scorable_attributes ,
                                                        scorable_engines = args.scorable_engines ,
                                                        norm_synonyms = args.normalization_synonyms )
        except UnboundLocalError as e:
            log.error( 'UnboundLocalError exception in evaluate_positions: {}'.format( e ) )
        except NameError as e:
            log.error( 'NameError exception in evaluate_positions: {}'.format( e ) )
        except TypeError as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            log.error( 'TypeError in evaluate_positions ({}): {}'.format( exc_tb.tb_lineno , e ) )
        except ValueError as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            log.error( 'ValueError in evaluate_positions ({}): {}'.format( exc_tb.tb_lineno , e ) )
        except KeyboardInterrupt as e:
            log.error( 'KeyboardInterrupt in evaluate_positions: {}'.format( e ) )
            sys.exit( 0 )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in evaluate_positions: {}'.format( e ) )
    ##
    if( args.csv_out and
        os.path.exists( args.csv_out ) ):
        os.remove( args.csv_out )
    ##
    # scoring_metrics.print_counts_summary_shell( confusion_matrix ,
    #                                             file_mapping ,
    #                                             reference_patterns , test_patterns ,
    #                                             args = args )
    if( args.print_confusion_matrix ):
        scoring_metrics.print_confusion_matrix_shell( confusion_matrix ,
                                                      file_mapping ,
                                                      reference_patterns , test_patterns ,
                                                      args = args )
    if( args.print_metrics ):
        scoring_metrics.print_score_summary_shell( score_card ,
                                                   file_mapping ,
                                                   reference_patterns , test_patterns ,
                                                   args = args )
    if( '2018 n2c2 track 1' in args.print_custom ):
        scoring_metrics.print_2018_n2c2_track1( score_card ,
                                                file_mapping ,
                                                args = args )
    #########
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )

def init_args():
    ##
    args = args_and_configs.get_arguments( sys.argv[ 1: ] )
    ## Set up logging
    if args.verbose:
        log.basicConfig( format = "%(levelname)s: %(message)s" ,
                         level = log.DEBUG )
        log.info( "Verbose output." )
        log.debug( "{}".format( args ) )
    else:
        log.basicConfig( format="%(levelname)s: %(message)s" )
    ## Configure progressbar performance
    if( args.progressbar_output == 'none' ):
        args.progressbar_disabled = True
        args.progressbar_file = None
    else:
        args.progressbar_disabled = False
        if( args.progressbar_output == 'stderr' ):
            args.progressbar_file = sys.stderr
        elif( args.progressbar_output == 'stdout' ):
            args.progressbar_file = sys.stdout
    ## F-score beta values are commonly set to 1, 2, and 0.5 but we
    ## want to support any values.  It's easiest to do filtering at
    ## this point in the pipeline to standardize beta values and how
    ## they show up in the pipeline
    if( 'F' in args.metrics_list ):
        f_position = args.metrics_list.index( 'F' )
        args.metrics_list.pop( f_position )
        if( len( args.f_beta_values ) == 0 ):
            log.warning( 'F was included in the list of metrics to calculate but no beta values were provided (--f-beta-values <betas>)' )
        else:
            ## Reverse the list so that they get inserted into the metrics_list
            ## in the proper order
            args.f_beta_values.reverse()
            for beta in args.f_beta_values:
                if( 'F{}'.format( beta ) not in args.metrics_list ):
                    args.metrics_list.insert( f_position , 'F{}'.format( beta ) )
    else:
        if( len( args.f_beta_values ) > 0 ):
            log.warning( 'F beta values were provided but "F" was not included in the list of metrics to calculate (--f-beta-values <betas>)' )
        args.f_beta_values = []
        for common_beta in [ '1' , '2' , '0.5' ]:
            if( 'F{}'.format( common_beta ) in args.metrics_list ):
                if( common_beta not in args.f_beta_values ):
                    args.f_beta_values.append( common_beta )
    ## The command line parameters are always initially cast as strings.
    ## That works fine for some empty values.  Sometimes we want to use
    ## 0 (int) or 0.0 (float) or -1 as empty values.  In this case,
    ## it's best to cast the string to the appropriate numerical
    ## type for formatting later.
    if( args.empty_value is not None and
        args.empty_value != '' ):
        try:
            args.empty_value = int( args.empty_value )
        except ValueError:
            log.debug( 'Default empty_value is not an int' )
            try:
                args.empty_value = float( args.empty_value )
            except ValueError:
                log.debug( 'Default empty_value is not a float' )
    ## Resolve conflicts between --ignore-whitespace, --heed-whitespace,
    ## and --ignore-regex flags.  Essentially, if we set something in
    ## skip_chars, use that.  Otherwise, if we tripped --ignore_whitespace
    ## then set skip_chars accordingly
    if( args.ignore_whitespace and
        args.skip_chars == None ):
        args.skip_chars = r'[\s]'
    ## lstrip hack added to handle prefixes and suffixes with dashes
    ## https://stackoverflow.com/questions/16174992/cant-get-argparse-to-read-quoted-string-with-dashes-in-it
    args.file_prefix = args.file_prefix.lstrip()
    args.file_suffix[ 0 ] = args.file_suffix[ 0 ].lstrip()
    if( len( args.file_suffix ) == 2 ):
        args.file_suffix[ 1 ] = args.file_suffix[ 1 ].lstrip()
    ## Initialize the list of annotation attributes to score
    args.attributes_list = []
    args.scorable_attributes = []
    if( isinstance( args.attributes_string , str ) ):
        for attribute_key in args.attributes_string.split( ',' ):
            ## Strip off any extra whitespace before processing
            attribute_key = attribute_key.strip()
            attribute_kernel = attribute_key.split( '/' )
            last = len( attribute_kernel ) - 1
            args.attributes_list.append( [ attribute_kernel[ 0 ] ,
                                           attribute_kernel[ last ] ] )
    ## Initialize the list of normalization engines to score
    args.normalization_list = []
    args.scorable_engines = []
    args.normalization_synonyms = {}
    if( isinstance( args.normalization_string , str ) ):
        for normalization_key in args.normalization_string.split( ',' ):
            ## Strip off any extra whitespace before processing
            normalization_key = normalization_key.strip()
            normalization_kernel = normalization_key.split( '/' )
            last = len( normalization_kernel ) - 1
            args.normalization_list.append( [ normalization_kernel[ 0 ] ,
                                              normalization_kernel[ last ] ] )
        ## Only bother to load the normalization_file if the --score-normalization
        ## flag was used
        args.normalization_synonyms = \
          args_and_configs.process_normalization_file( args.normalization_file )
    ## Initialize the corpus settings, values, and metrics file
    ## if it was provided at the command line
    if( args.corpus_out ):
        ## Clean out any previous corpus dictionary, in case it exists from
        ## an old run
        with open( args.corpus_out , 'w' ) as fp:
            json.dump( {} , fp ,
                       sort_keys = True ,
                       indent = 4 )
        ## Add a few important arguments
        scoring_metrics.update_output_dictionary( args.corpus_out ,
                                                  [ 'args' ] ,
                                                  [ 'reference_config' , 'reference_input' , 'reference_out' ,
                                                    'test_config' , 'test_input' , 'test_out' ,
                                                    'score_key' , 'fuzzy_flags' ] ,
                                                  [ args.reference_config , args.reference_input , args.reference_out ,
                                                    args.test_config , args.test_input , args.test_out ,
                                                    args.score_key , args.fuzzy_flags ] )
    return args

if __name__ == "__main__":
    ##
    args = init_args()
    ## Extract and process the two input file configs
    if( args.reference_input ):
        try:
            reference_ns , reference_dd , reference_patterns = \
              args_and_configs.process_config( config_file = args.reference_config ,
                                               score_key = args.score_key ,
                                               score_values = args.score_values ,
                                               collapse_all_patterns = args.collapse_all_patterns ,
                                               verbose = args.verbose )
        except args_and_configs.configparser.NoOptionError as e:
            log.error( 'NoOptionError in process_config for reference config: {}'.format( e ) )
        except NameError as e:
            log.error( 'NameError in process_config for reference config: {}'.format( e ) )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in process_config for reference config: {}'.format( e ) )
        if( reference_patterns == [] ):
            log.error( 'No reference patterns extracted from config.  Bailing out now.' )
            exit( 1 )
    if( args.test_input ):
        try:
            test_ns , test_dd , test_patterns = \
              args_and_configs.process_config( config_file = args.test_config ,
                                               score_key = args.score_key ,
                                               score_values = args.score_values ,
                                               collapse_all_patterns = args.collapse_all_patterns ,
                                               verbose = args.verbose )
        except args_and_configs.configparser.NoOptionError as e:
            log.error( 'NoOptionError in process_config for system output config: {}'.format( e ) )
        except NameError as e:
            log.error( 'NameError in process_config for system output config: {}'.format( e ) )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in process_config for system output config: {}'.format( e ) )
        if( test_patterns == [] ):
            log.error( 'No test patterns extracted from config.  Bailing out now.' )
            exit( 1 )
    if( args.reference_input and args.test_input ):
        try:
            reference_patterns , test_patterns = \
              args_and_configs.align_patterns( reference_patterns , test_patterns ,
                                               collapse_all_patterns = args.collapse_all_patterns )
            if( len( reference_patterns ) == 0 ):
                log.error( 'Zero annotation patterns found in reference config after filtering against system output config.' )
                exit( 1 )
            if( len( test_patterns ) == 0 ):
                log.error( 'Zero annotation patterns found in system output config after filtering against reference config.' )
                exit( 1 )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in align_patterns: {}'.format( e ) )
    ## Get the intersection of attributes defined in the ref and sys patterns
    ## along with those listed in the --score-attributes argument
    if( args.attributes_string is not None ):
        ## TODO - filter the scorable attributes based on just a single
        ##        reference or test input pattern base
        if( args.reference_input and args.test_input ):
            try:
                unique_reference_attributes = \
                  args_and_configs.unique_attributes( reference_patterns )
                unique_test_attributes = \
                  args_and_configs.unique_attributes( test_patterns )
            except AttributeError as e:
                log.error( 'AttributeError in unique_attributes: {}'.format( e ) )
            except TypeError as e:
                log.error( 'TypeError in unique_attributes: {}'.format( e ) )
            try:
                if( args.attributes_list == [] ):
                    ## No attributes explicitly listed so we use the intersection
                    ## of listed attributes
                    for attribute in sorted( list( unique_reference_attributes &
                                                   unique_test_attributes ) ):
                        args.scorable_attributes.append( [ attribute , attribute ] )
                else:
                    ## A list type means that a filtered list of attributes were provided
                    ## as arguments to the command line
                    for attribute_pair in args.attributes_list:
                        if( attribute_pair[ 0 ] in unique_reference_attributes and
                            attribute_pair[ 1 ] in unique_test_attributes ):
                            args.scorable_attributes.append( attribute_pair )
                if( len( args.scorable_attributes ) == 0 ):
                    log.error( 'Zero annotation attributes match between the reference and system pattern definitions.  Correct your configs or provide mappings between attribute spellings.' )
                    exit( 1 )
            except TypeError as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                log.error( 'TypeError in scorable attribute creation (ln {}): {}'.format( exc_tb.tb_lineno , e ) )
    ## Get the intersection of normalization engines defined in the ref
    ## and sys document data settings along with those listed in the
    ## --score-normalization argument
    if( args.normalization_string is not None ):
        if( args.reference_input and 'normalization_engines' in reference_dd and
            args.test_input and 'normalization_engines' in test_dd ):
            try:
                if( args.normalization_list == [] ):
                    ## No engines explicitly listed so we use the intersection
                    ## of document data defined engines
                    for normalization_engine in sorted( list( set( reference_dd[ 'normalization_engines' ] ) &
                                                              set( test_dd[ 'normalization_engines' ] ) ) ):
                        args.scorable_engines.append( [ normalization_engine , normalization_engine ] )
                else:
                    ## A list type means that a filtered list of attributes were provided
                    ## as arguments to the command line
                    for engine_pair in args.normalization_list:
                        if( engine_pair[ 0 ] in reference_dd[ 'normalization_engines' ] and
                            engine_pair[ 1 ] in test_dd[ 'normalization_engines' ] ):
                            args.scorable_engines.append( engine_pair )
                if( len( args.scorable_engines ) == 0 ):
                    log.error( 'Zero normalization engines match between the reference and system document data definitions.  Correct your configs' )
                    exit( 1 )
            except TypeError as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                log.error( 'TypeError in scorable attribute creation (ln {}): {}'.format( exc_tb.tb_lineno , e ) )
    ##
    if( args.align_tokens ):
        align_tokens( reference_folder = os.path.abspath( args.reference_input ) ,
                      test_folder = os.path.abspath( args.test_input ) ,
                      args = args ,
                      file_prefix = args.file_prefix ,
                      file_suffix = args.file_suffix )
    else:
        ## TODO - make a more efficient loop that will count and/or score in a single pass
        ##        rather than doing a full double pass
        if( args.print_counts ):
            if( args.reference_input ):
                try:
                    count_ref_set( this_ns = reference_ns ,
                                   this_dd = reference_dd ,
                                   this_patterns = reference_patterns ,
                                   this_folder = os.path.abspath( args.reference_input ) ,
                                   args = args ,
                                   file_prefix = args.file_prefix ,
                                   file_suffix = args.file_suffix[ len( args.file_suffix ) - 1 ] ,
                                   set_type = 'reference' )
                except AttributeError as e:
                    log.error( 'AttributeError exception in count_ref_set for reference output corpus: {}'.format( e ) )
                except KeyError as e:
                    log.error( 'KeyError in count_ref_set for reference output corpus: {}'.format( e ) )
                except NameError as e:
                    log.error( 'NameError in count_ref_set for reference output corpus: {}'.format( e ) )
                except TypeError as e:
                    log.error( 'TypeError in count_ref_set for reference output corpus: {}'.format( e ) )
                except:
                    e = sys.exc_info()[0]
                    log.error( 'Uncaught exception in count_ref_set for reference output corpus: {}'.format( e ) )
            ##
            if( args.test_input ):
                try:
                    count_ref_set( this_ns = test_ns ,
                                   this_dd = test_dd ,
                                   this_patterns = test_patterns ,
                                   this_folder = os.path.abspath( args.test_input ) ,
                                   args = args ,
                                   file_prefix = args.file_prefix ,
                                   file_suffix = args.file_suffix[ len( args.file_suffix ) - 1 ] ,
                                   set_type = 'test' )
                except AttributeError as e:
                    log.error( 'AttributeError exception in count_ref_set for system output corpus: {}'.format( e ) )
                except KeyError as e:
                    log.error( 'KeyError in count_ref_set for system output corpus: {}'.format( e ) )
                except NameError as e:
                    log.error( 'NameError in count_ref_set for system output corpus: {}'.format( e ) )
                except TypeError as e:
                    log.error( 'TypeError in count_ref_set for system output corpus: {}'.format( e ) )
                except:
                    e = sys.exc_info()[0]
                    log.error( 'Uncaught exception in count_ref_set for system output corpus: {}'.format( e ) )
        ##
        if( args.print_confusion_matrix or
            args.print_metrics or
            len( args.print_custom ) > 0 ):
            try:
                score_ref_set( reference_ns = reference_ns ,
                               reference_dd = reference_dd ,
                               reference_patterns = reference_patterns ,
                               reference_folder = os.path.abspath( args.reference_input ) ,
                               test_ns = test_ns ,
                               test_dd = test_dd ,
                               test_patterns = test_patterns ,
                               test_folder = os.path.abspath( args.test_input ) ,
                               args = args ,
                               file_prefix = args.file_prefix ,
                               file_suffix = args.file_suffix )
            except NameError as e:
                log.error( 'NameError in score_ref_set: {}'.format( e ) )
            except IndexError as e:
                log.error( 'IndexError in score_ref_set: {}'.format( e ) )
            except KeyError as e:
                log.error( 'KeyError in score_ref_set: {}'.format( e ) )
            except ValueError as e:
                log.error( 'ValueError in score_ref_set: {}'.format( e ) )
            except TypeError as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                log.error( 'TypeError in score_ref_set ({}): {}'.format( exc_tb.tb_lineno , e ) )
            except:
                e = sys.exc_info()[0]
                log.error( 'Uncaught exception in score_ref_set: {}'.format( e ) )
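
## ---------------------------------------------------------------------
## Illustrative invocation (a sketch only, not part of the original
## module): the exact command-line flag spellings are defined in
## args_and_configs.get_arguments() and are merely assumed here from the
## argparse destination names used above (reference_input,
## reference_config, test_input, test_config).  Verify them against the
## script's --help output before relying on this example.
##
##     python etude.py \
##         --reference-input /path/to/reference/corpus \
##         --reference-config /path/to/reference.conf \
##         --test-input /path/to/system/output \
##         --test-config /path/to/system.conf
## ---------------------------------------------------------------------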