view Lib/BNFCompiler.pm @ 40:ac21a032e7a9

bnf parser in progress
author Sergey
date Thu, 10 Dec 2009 17:43:39 +0300
parents 03e58a454b20
children 16ada169ca75
line wrap: on
line source

package BNFCompiler;
package BNFCompiler::DOM;
package BNFCompiler::DOM::Builder;
package BNFCompiler::DOM::Node;
use strict;

package BNFCompiler::EventParser;
use strict;
use lib '.';
use Common;
our @ISA = qw(Object);

our $EventMapSchema = {
    Description => 'Parser events',
    Type => 'HASH',
    Values => 'SCALAR'
};

BEGIN {
    DeclareProperty(EventMap => ACCESS_READ);
    DeclareProperty(CompiledEvents => ACCESS_NONE);
    DeclareProperty(Handler => ACCESS_ALL);
}

sub CTOR {
    my ($this,%args) = @_;
    $this->SUPER::CTOR(%args);
}

sub Compile {
    my ($this) = @_;
    
    delete $this->{$CompiledEvents};
    while (my ($key,$val) = each %{$this->{$EventMap}}) {
        $this->{$CompiledEvents}{$key} = qr/\G$val/;
    }
    1;
}

sub Parse {
    my ($this,$data) = @_;
    
    my $StateData;
    OUTER: while(pos($data) < length($data)) {
        keys %{$this->{$CompiledEvents}};
        while (my ($event,$match) = each %{$this->{$CompiledEvents}}) {
            if ($data =~ m/($match)/gc) {
                $StateData .= $1;
                eval {
                    undef $StateData if $this->{$Handler}->($event,$StateData);
                };
                if ($@) {
                    die ["Invalid syntax","unexpected $event: $1",pos($data)];
                }
                next OUTER;
            }
        }
        die ["Invalid syntax",substr($data,pos($data),10),pos($data)];
    }
    
    return 1;
}

# небольшая уловка, поскольку ref от регулярного выражения есть Regexp, можно поставить хуки
package Regexp;
use Data::Dumper;

sub STORABLE_freeze {
    my ($obj,$cloning) = @_;
    
    return $obj;
}

sub STORABLE_attach {
    my($class, $cloning, $serialized) = @_;
    return qr/$serialized/;
}

package BNFCompiler;
use Common;
use Storable;
use Data::Dumper;
our @ISA = qw(Object);

our $BNFSchema;
my $ParseAgainstSchema;
my $TransformDOMToBNF;

BEGIN {
    DeclareProperty(Schema => ACCESS_NONE);
    DeclareProperty(SchemaCache => ACCESS_NONE);
    DeclareProperty(Transform => ACCESS_NONE);
}

sub CTOR {
    my $this = shift;
    $this->SUPER::CTOR(@_);
    
    $this->{$SchemaCache} .= '/' if ($this->{$SchemaCache} and not $this->{$SchemaCache} =~ /\/$/);
}
{
    my $compiledBNFSchema;
    sub LoadBNFSchema {
        my ($this,%args) = @_;
        
        my $CompileBNFText = sub {
            my ($this,$text) = @_;
            
            my %SchemaDOM;
            foreach my $item (split /\n{2,}/, $text) {
                next if not $item;
                $compiledBNFSchema = CompileBNFSchema($BNFSchema) if not $compiledBNFSchema;
                my $context = new BNFCompiler::DOM::Builder();
                eval {
                    my $expr = &$ParseAgainstSchema($compiledBNFSchema,$item,$context);
                    die ["Unexpected expression", $expr] if $expr;
                };
                if ($@) {
                    if (ref $@ eq 'ARRAY') {
                        die new Exception(@{$@});
                    } else {
                        die $@;
                    }
                }
                
                $SchemaDOM{$context->Document->selectNodes('name')->text()} = &$TransformDOMToBNF($context->Document->selectNodes('def'));
                
            }
            
            $SchemaDOM{'separator'} = ['re:\\s+'];
            $this->{$Schema} = CompileBNFSchema(\%SchemaDOM);
        };
        
        my $text;
        if ($args{'file'}) {
            
            my $fnameCached;
            if ($this->{$SchemaCache}) {
                my $fname = $args{'file'};
                $fname =~ tr/\//_/;
                $fnameCached = $this->{$SchemaCache}.$fname.'.cbs';
                if ( -e $fnameCached && -f $fnameCached && ( -M $args{'file'} >= -M $fnameCached )) {
                    my $compiledSchema = retrieve($fnameCached);
                    if ($compiledSchema) {
                        $this->{$Schema} = $compiledSchema;
                        return 1;
                    } else {
                        unlink $fnameCached;
                    }
                }
            }
            open my $hFile, '<', $args{'file'} or die new Exception("Failed to open file",$args{'file'},$!);
            local $/ = undef;
            my $text = <$hFile>;
            
            $this->$CompileBNFText($text);
            
            if ($fnameCached) {
                store($this->{$Schema},$fnameCached);
            }
        } elsif ($args{'Schema'}) {
            $this->{$Schema} = CompileBNFSchema($args{'Schema'});
            return 1;
        } elsif ($args{'text'}) {
            $this->$CompileBNFText( $args{'text'} );
        } else {
            die new Exception("'file', 'text' or 'Schema' parameter required");
        }
        
    }
}

sub Parse {
    my ($this, $string, %flags) = @_;
    
    my $context = new BNFCompiler::DOM::Builder;
    
    eval {
        my $err;
        $err = &$ParseAgainstSchema($this->{$Schema},$string,$context,\%flags) and die new Exception('Failed to parse',substr($err,0,80).' ...');
    };
    if ($@) {
        if (ref $@ eq 'ARRAY') {
            die new Exception(@{$@});
        } else {
            die $@;
        }
    }
    if (not $this->{$Transform}) {
        return $context->Document;
    } else {
        return $this->{$Transform}->($context->Document);
    }
}

sub Dispose {
    my ($this) = shift;
    CleanSchema($this->{$Schema});
    delete @$this{$Schema, $Transform};
    $this->SUPER::Dispose;
}

sub CleanSchema {
    my ($schema,$table) = @_;
    
    UNIVERSAL::isa($schema,'ARRAY') or return;
    $table or $table = { $schema, 1};
    
    for(my $i=0; $i<@$schema;$i++) {
        my $item = $schema->[$i];
        if (ref $item) {
            next if $table->{$item};
            $table->{$item} = 1;
            if (UNIVERSAL::isa($item,'ARRAY')) {
                CleanSchema($item,$table);
            } elsif( UNIVERSAL::isa($item,'HASH')) {
                CleanSchema($item->{'syntax'},$table);
            }
            undef $schema->[$i];
        }
    }
}


sub OPT {
    return bless [@_], 'OPT';
}

sub SWITCH {
    return bless [@_], 'SWITCH';
}

sub REPEAT {
    return bless [@_], 'REPEAT';
}

$TransformDOMToBNF = sub {
    my ($nodeRoot) = @_;
    
    return [grep $_, map {
        my $nodeName = $_->nodeName;
        if (not $nodeName ){
            my $obj = $_;
            $obj->text() if (not( grep { $obj->text() eq $_} ('{', '}', '[', ']') ) );
        }elsif($nodeName eq 'name') {
            $_->text();
        } elsif ($nodeName eq 'separator') {
            OPT('separator');
        } elsif ($nodeName eq 'or_sep') {
            # nothing
        } elsif ($nodeName eq 'switch_part') {
            &$TransformDOMToBNF($_);
        } elsif ($nodeName eq 'class') {
            my $class = $_->childNodes->[0]->text();
            
            $class =~ s{(^<|>$|\\.|[\]\[])}{
                my $char = { '>' => '', '<' => '', '[' => '\\[', ']' => '\\]', '\\\\' => '\\\\'}->{$1};
                defined $char ? $char : ($1 =~ tr/\\// && $1);
            }ge;
            $class = '['.$class.']';
            $class .= $_->childNodes->[1]->text() if $_->childNodes->[1];
            're:'.$class;
        } elsif ($nodeName eq 'symbol') {
            $_->text();
        } elsif ($nodeName eq 'simple') {
            @{&$TransformDOMToBNF($_)};
        } elsif ($nodeName eq 'multi_def') {
            @{&$TransformDOMToBNF($_)};
        } elsif ($nodeName eq 'optional') {
            my $multi_def = &$TransformDOMToBNF($_);
            if ($multi_def->[scalar(@{$multi_def})-1] eq '...') {
                pop @{$multi_def};
                OPT(REPEAT(@{$multi_def}));
            } else {
                OPT(@{$multi_def});
            }
        } elsif ($nodeName eq 'switch') {
            SWITCH(@{&$TransformDOMToBNF($_)});
        } elsif ($nodeName eq 'def') {
            @{&$TransformDOMToBNF($_)};
        } else{
            die "unknown nodeName: $nodeName";
        }
    } @{$nodeRoot->childNodes}];
};

$BNFSchema = {
    syntax => ['name',OPT('separator'),'::=',OPT('separator'),'def'],    
    name => ['re:\\w+'],    
    class => ['re:<([^<>\\\\]|\\\\.)+>',OPT('re:\\*|\\+|\\?|\\{\\d+\\}')],
    symbol => ['re:[^\\w\\d\\s\\[\\]{}<>\\\\|]+'],
    separator => ['re:\\s+'],
    simple => [
                SWITCH(
                    'name',
                    'class',
                    'symbol'
                )
    ],
    multi_def => [
        OPT('separator'), SWITCH('...',[SWITCH('simple','optional','switch'),OPT('multi_def')])
    ],
    optional => [
        '[','multi_def', OPT('separator') ,']'

    ],
    keyword => [],
    or_sep => ['|'],
    switch_part => [OPT('separator'),SWITCH('simple','optional','switch'),OPT(REPEAT(OPT('separator'),SWITCH('simple','optional','switch'))),OPT('separator')],
    switch => [
        '{','switch_part',OPT(REPEAT('or_sep','switch_part')),'}'
    ],
    def => [REPEAT(OPT('separator'),SWITCH('simple','optional','switch'))]
};

my $CompileTerm;
$CompileTerm = sub {
    my ($term,$Schema,$cache,$ref) = @_;
    
    my $compiled = ref $term eq 'ARRAY' ? ($ref or []) : bless (($ref or []), ref $term);
    
    die new Exception("Invalid term type $term", $term, ref $term) if not grep ref $term eq $_, qw(ARRAY REPEAT SWITCH OPT);
    
    foreach my $element (@{$term}) {
        if (ref $element) {
            push @{$compiled}, &$CompileTerm($element,$Schema,$cache);
        } else {
            if($element =~/^\w+$/) {
                if (exists $Schema->{$element}) {
                    # reference
                    my $compiledUnit;
                    if (exists $cache->{$element}) {
                        $compiledUnit = $cache->{$element};
                    } else {
                        $compiledUnit = [];
                        $cache->{$element} = $compiledUnit;
                        &$CompileTerm($Schema->{$element},$Schema,$cache,$compiledUnit);
                    }
                    
                    push @{$compiled},{ name => $element, syntax => $compiledUnit};
                } else {
                    # simple word
                    push @{$compiled}, $element;
                }
            } elsif ($element =~ /^re:(.*)/){
                # regexp
                push @{$compiled},qr/\G(?:$1)/;
            } else {
                # char sequence
                push @{$compiled},$element;
            }            
        }
    }
    
    return $compiled;
};

sub CompileBNFSchema {
    my($Schema) = @_;
    
    my %Cache;
    return &$CompileTerm($Schema->{'syntax'},$Schema,\%Cache);
}

my $CompiledSchema = CompileBNFSchema($BNFSchema);

$ParseAgainstSchema = sub {
    my ($Schema,$expression,$context,$flags,$level) = @_;
    
    $level = 0 if not defined $level;
    my $dbgPrint = $flags->{debug} ? sub {
        print "\t" x $level, @_,"\n";
    } : sub {};
    
    foreach my $elem (@{$Schema}) {
        my $type = ref $elem;
        $expression = substr $expression,pos($expression) if $type ne 'Regexp' and pos($expression);
        
        if ($type eq 'HASH') {
            $context->NewContext($elem->{'name'});
            &$dbgPrint("$elem->{name} ", join(',',map { ref $_ eq 'HASH' ? $_->{name} : $_ }@{$elem->{'syntax'}}));
            eval {
                $expression = &$ParseAgainstSchema($elem->{'syntax'},$expression,$context,$flags,$level+1);
            };
            if ($@) {
                $context->EndContext(0);
                &$dbgPrint("/$elem->{name} ", "0");
                die $@;
            } else {
                &$dbgPrint("/$elem->{name} ", "1");
                $context->EndContext(1);
            }
        } elsif ($type eq 'ARRAY') {
            &$dbgPrint("entering ", join(',',map { ref $_ eq 'HASH' ? $_->{name} : $_ }@{$elem}));
            $expression = &$ParseAgainstSchema($elem,$expression,$context,$flags,$level+1);
            &$dbgPrint("success");
        } elsif ($type eq 'OPT') {
            if (defined $expression) {
                &$dbgPrint("optional ",join(',',map { ref $_ eq 'HASH' ? $_->{name} : $_ }@{$elem}));
                eval {
                    $expression = &$ParseAgainstSchema($elem,$expression,$context,$flags,$level+1);
                };
                if ($@) {
                    &$dbgPrint("failed");
                    undef $@;
                } else {
                    &$dbgPrint("success");
                }
            }
        } elsif ($type eq 'SWITCH') {
            my $success = 0;
            &$dbgPrint("switch");
            LOOP_SWITCH: foreach my $subelem (@{$elem}) {
                eval {
                    &$dbgPrint("\ttry ",join(',',map { ref $_ eq 'HASH' ? $_->{name} : $_ } @{(grep ref $subelem eq $_, qw(ARRAY SWITCH OPT REPEAT)) ? $subelem : [$subelem]}));
                    $expression = &$ParseAgainstSchema((grep ref $subelem eq $_, qw(ARRAY SWITCH OPT REPEAT)) ? $subelem : [$subelem],$expression,$context,$flags,$level+1);
                    $success = 1;
                };
                if ($@) {
                    undef $@;
                } else {
                    last LOOP_SWITCH;
                }
            }
            if ($success) {
                &$dbgPrint("success");
            } else {
                &$dbgPrint("failed");
                die ["syntax error",$expression,$elem];
            }
        } elsif ($type eq 'REPEAT') {
            my $copy = [@{$elem}];
            my $i = 0;
            &$dbgPrint("repeat ",join(',',map { ref $_ eq 'HASH' ? $_->{name} : $_ }@{$elem}));
            while (1) {
                eval {
                    $expression = &$ParseAgainstSchema($copy,$expression,$context,$flags,$level+1);
                    $i++;
                };
                if ($@) {
                    if (not $i) {
                        &$dbgPrint("failed");
                        die $@;
                    }
                    &$dbgPrint("found $i matches");
                    undef $@;
                    last;
                }
            }
        } elsif ($type eq 'Regexp') {
                my $posPrev = pos($expression) || 0;
                if ( $expression =~ m/($elem)/ ) {
                    $context->Data($1);
                    pos($expression) = $posPrev+length($1);
                    &$dbgPrint("Regexp: $1 $elem ", pos($expression));
                } else {
                    &$dbgPrint("Regexp: $elem failed");
                    die ["syntax error",$expression,$elem,$posPrev];
                    pos($expression) = $posPrev;
                }
        } else {
            if ((my $val = substr($expression, 0, length($elem),'')) eq $elem) {
                &$dbgPrint("Scalar: $val");
                $context->Data($elem);
            } else {
                &$dbgPrint("Scalar: failed $val expected $elem");
                die ["syntax error",$val.$expression,$elem];
            }
        }
        
    }
    
    if (pos $expression) {
        return substr $expression,(pos($expression) || 0);
    } else {
        return $expression;
    }
    
};

package BNFCompiler::DOM::Node;
use Common;
our @ISA = qw(Object);

sub NODE_TEXT { 1 }
sub NODE_ELEM { 2 }

BEGIN {
    DeclareProperty(nodeName => ACCESS_READ);
    DeclareProperty(nodeType => ACCESS_READ);
    DeclareProperty(nodeValue => ACCESS_READ);
    DeclareProperty(childNodes => ACCESS_READ);
    DeclareProperty(isComplex => ACCESS_READ);
}

sub CTOR {
    my ($this,%args) = @_;
    $args{'nodeType'} = NODE_ELEM if not $args{'nodeType'};
    die new Exception("Invalid args. nodeName reqired.") if $args{'nodeType'} == NODE_ELEM and not $args{nodeName};
    
    #for speed reason
    #$this->SUPER::CTOR(%args);
    
    $this->{$nodeName} = $args{'nodeName'} if $args{'nodeName'};
    $this->{$nodeType} = $args{'nodeType'};
    $this->{$nodeValue} = $args{'nodeValue'} if exists $args{'nodeValue'};
    
    $this->{$isComplex} = 0;
}

sub insertNode {
    my ($this,$node,$pos) = @_;
    
    die new Exception("Invalid operation on text node.") if $this->{$nodeType} != NODE_ELEM;
    die new Exception("Invalid node type",ref $node) if ref $node ne __PACKAGE__;
    
    $this->{$childNodes} = [] if not $this->{$childNodes};
    
    $pos = scalar(@{$this->{$childNodes}}) if not defined $pos;
    die new Exception("Index out of range",$pos) if $pos > scalar(@{$this->{$childNodes}}) or $pos < 0;
    
    splice @{$this->{$childNodes}},$pos,0,$node;
    $this->{$isComplex} = 1 if not $this->{$isComplex} and $node->{$nodeType} == NODE_ELEM;
    
    return $node;
}

sub removeNode {
    my ($this,$node) = @_;
    
    die new Exception("Invalid operation on text node.") if $this->{$nodeType} != NODE_ELEM;
    @{$this->{$childNodes}} = grep { $_ != $node } @{$this->{$childNodes}};
    
    return $node;
}

sub removeAt {
    my ($this,$pos) = @_;
    
    die new Exception("Invalid operation on text node.") if $this->{$nodeType} != NODE_ELEM;
    die new Exception("Index out of range",$pos) if $pos >= scalar(@{$this->{$childNodes}}) or $pos < 0;
    
    return splice @{$this->{$childNodes}},$pos,1;
}

sub selectNodes {
    my ($this,$name) = @_;
    
    die new Exception("Invalid operation on text node.") if $this->{$nodeType} != NODE_ELEM;
    
    my @nodes = grep { $_->{$nodeType} == NODE_ELEM and $_->{$nodeName} eq $name } @{$this->{$childNodes}};
    
    if (wantarray) {
        return @nodes;
    } else {
        return shift @nodes;
    }
}

sub text {
    my $this = shift;
    
    if ($this->{$nodeType} == NODE_TEXT) {
        return $this->{$nodeValue};
    } else {
        my @texts;
        
        foreach my $node (@{$this->{$childNodes}}) {
            push @texts, $node->{$nodeValue} if ($node->{$nodeType}==NODE_TEXT);
        }
        
        if (wantarray) {
            return @texts;
        } else {
            return join '',@texts;
        }
    }
}

package BNFCompiler::DOM::Builder;
use Common;
our @ISA=qw(Object);

BEGIN {
    DeclareProperty(Document => ACCESS_READ);
    DeclareProperty(currentNode => ACCESS_NONE);
    DeclareProperty(stackNodes => ACCESS_NONE);
}

sub CTOR {
    my $this = shift;
    
    $this->{$Document} = new BNFCompiler::DOM::Node(nodeName => 'Document', nodeType => BNFCompiler::DOM::Node::NODE_ELEM);
    $this->{$currentNode} = $this->{$Document};
}

sub NewContext {
    my ($this,$contextName) = @_;
        
    push @{$this->{$stackNodes}},$this->{$currentNode};
    $this->{$currentNode} = new BNFCompiler::DOM::Node(nodeName => $contextName, nodeType=> BNFCompiler::DOM::Node::NODE_ELEM);

    return 1;
}
sub EndContext{
    my ($this,$isNotEmpty) = @_;
    
    if ($isNotEmpty) {
        my $child = $this->{$currentNode};
        $this->{$currentNode} = pop @{$this->{$stackNodes}};
        $this->{$currentNode}->insertNode($child);
    } else {
        $this->{$currentNode} = pop @{$this->{$stackNodes}};
    }
}
sub Data {
    my ($this,$data) = @_;
    $this->{$currentNode}->insertNode(new BNFCompiler::DOM::Node(nodeType=> BNFCompiler::DOM::Node::NODE_TEXT, nodeValue => $data));
}

package BNFCompiler::DOM;

sub TransformDOMToHash {
    my ($root,$options) = @_;
    
    my %content;
    
    if (not $root->childNodes) {
        die;
    }
    
    foreach my $child (@{$root->childNodes}) {
        if ($child->nodeType == BNFCompiler::DOM::Node::NODE_ELEM) {
            my @newValue;
            my $nodeName = $child->nodeName;
            next if $nodeName eq 'separator' and $options->{'skip_spaces'};
            if ($child->isComplex) {
                $newValue[0] = TransformDOMToHash($child,$options);
            } else {
                @newValue = $child->text()
            }
            
            if ($options->{'use_arrays'}) {
                push @{$content{$nodeName}},@newValue;
            }
            
            if (exists $content{$nodeName}) {
                if (ref $content{$nodeName} eq 'ARRAY') {
                    push @{$content{$nodeName}}, @newValue;
                } else {
                    $content{$nodeName} = [$content{$nodeName},@newValue];
                }
            } else {
                $content{$nodeName} = $newValue[0] if scalar(@newValue) == 1;
                $content{$nodeName} = \@newValue if scalar(@newValue) > 1;
            }
        } else {
            next if $options->{'skip_text'};
            push @{$content{'_text'}},$child->nodeValue();
        }
    }
    
    return \%content;
}

1;