#!/usr/bin/env perl
use v5.14.1;

use PICA::Data;
use PICA::Schema;
use IO::File;
use Getopt::Long;
use Pod::Usage;

# Command line options. $to defaults to 0 (no output); because its value is
# optional (":s"), a bare --to yields the empty string instead.
my ($from, $schema, $report_unknown, $count, $path, $help);
my $to = 0;

GetOptions(
    'from|f=s'   => \$from,
    'to|t:s'     => \$to,
    'schema|s=s' => \$schema,
    'unknown|u!' => \$report_unknown,
    'count|c'    => \$count,
    'path|p=s'   => \$path,
    'help|?'     => \$help,
) or pod2usage(2);

# Show the manual on request, or when there is neither a file argument nor
# anything redirected into STDIN.
if ($help || (!@ARGV && -t STDIN)) {
    pod2usage(
        -verbose  => 99,
        -sections => "SYNOPSIS|OPTIONS|DESCRIPTION",
    );
}

# Accepted serialization names (also used as file extensions), mapped to the
# suffix of the PICA::Parser::* / PICA::Writer::* class for that format.
# Keys are lowercase; user input is lowercased before lookup.
my %types = (
    bin    => 'Binary',
    dat    => 'Binary',
    binary => 'Binary',
    plain  => 'Plain',
    plus   => 'Plus',
    xml    => 'XML',
    ppxml  => 'PPXML',
    json   => 'JSON',
);

# Open the input file if one was named. Without a file argument $fh stays
# undefined and is handed to the parser as is.
my $fh;
if (@ARGV) {
    my $file = $ARGV[0];

    # Guess the input format from the file extension unless --from was given.
    my ($extension) = $file =~ /\.([a-z]+)$/;
    if (!$from && defined $extension && $types{lc $extension}) {
        $from = $extension;
    }

    # The file must be opened whether or not its type was guessed.
    $fh = IO::File->new($file, "r")
        or die "failed to open '$file': $!\n";
}

# Normalize once so that validation and class lookup agree on the key.
$from = lc($from || 'plain');
pod2usage("unknown serialization type: $from") unless $types{$from};

# An empty string means --to was given without a type: write the input format.
# A false value (the default 0) means no output is written at all.
$to = $from if $to eq '';
$to = lc $to;
pod2usage("unknown serialization type: $to") unless !$to or $types{$to};

my $parser_class = "PICA::Parser::$types{$from}";
my $parser       = $parser_class->new($fh, bless => 1);
my $writer = $to ? "PICA::Writer::$types{$to}"->new(utf8 => 0) : undef;

# Options passed to every schema check; unknown fields are ignored unless
# --unknown was requested.
my %schema_options = (ignore_unknown => !$report_unknown);

# With --count, replace the flag by a hash of running totals.
$count = {records => 0, holdings => 0, items => 0, fields => 0,} if $count;

# Number of records with at least one validation error.
my $invalid = 0;

# Load the schema: replace the file name in $schema by a PICA::Schema object.
if ($schema) {
    use JSON::PP qw(decode_json);

    my $schema_fh = IO::File->new($schema, "r")
        or die "failed to open schema file '$schema': $!\n";
    my $json = do {local $/; <$schema_fh>};
    $schema_fh->close;

    # Report unreadable or non-object JSON with the file name instead of a
    # bare decoder error.
    my $definition = eval {decode_json($json)};
    die "failed to parse schema file '$schema': "
        . ($@ || "not a JSON object\n")
        unless ref $definition eq 'HASH';

    $schema = PICA::Schema->new($definition);
}

# Compile the --path expression once; a failing constructor means bad syntax.
if ($path) {
    my $compiled = eval {PICA::Path->new($path)};
    die "invalid pica path: $path\n" unless $compiled;
    $path = $compiled;
}

# Main loop: optionally reduce each record to the selected fields, write it,
# validate it against the schema and update the statistics.
while (my $record = $parser->next) {

    # NOTE(review): everything below operates on the value returned by
    # fields(); the counting code assumes it still offers holdings(), items()
    # and a {record} entry -- confirm against PICA::Data.
    $record = $record->fields($path) if $path;

    $writer->write($record) if $writer;

    if ($schema) {
        my @errors = $schema->check($record, %schema_options);
        if (@errors) {
            # Prefix each message with the record identifier when available.
            for my $error (@errors) {
                my $line
                    = defined $record->{_id}
                    ? $record->{_id} . ": $error"
                    : $error;
                print "$line\n";
            }
            $invalid++;
        }
    }

    next unless $count;

    $count->{records}++;
    $count->{holdings} += scalar @{$record->holdings};
    $count->{items}    += scalar @{$record->items};
    $count->{fields}   += scalar @{$record->{record}};
}

# Let writers that need it finish their output.
if ($writer && $writer->can('end')) {
    $writer->end();
}

# Print the statistics in a fixed order, skipping totals that were never set.
if ($count) {
    $count->{invalid} = $invalid if defined $invalid;
    for my $key (qw(records invalid holdings items fields)) {
        next unless defined $count->{$key};
        print $count->{$key} . " $key\n";
    }
}

__END__

=head1 NAME

picadata - parse and validate PICA+ data

=head1 SYNOPSIS

picadata [--from TYPE] [--schema FILE] [--to [TYPE]] {OPTIONS} [FILE]

=head1 DESCRIPTION

Parse, validate and/or serialize PICA+ data from the command line, e.g.:

  picadata pica.xml -s schema.json   # validate against Avram schema
  picadata pica.dat -t xml           # convert binary to XML
  picadata -c -f plain < pica.plain  # parse and count records
  picadata -p 003@ pica.xml -t       # extract field 003@

=head1 OPTIONS

=head2 --from, -f

PICA serialization type (plain, plus, binary, XML, ppxml, json) with plain as
default. Guessed from input filename unless specified. See format
documentation at L<http://format.gbv.de/pica>

=head2 --to, -t

PICA serialization type to enable writing parsed PICA data. If given without a
type, the input serialization type is used.

=head2 --count, -c

Count number of records, holdings, items, and fields.

=head2 --schema, -s

L<Avram Schema|http://format.gbv.de/schema/avram/specification> to validate
against.

=head2 --unknown, -u

Report unknown fields and subfields when validating (disabled by default).

=head2 --path, -p

Limit the record to fields specified by a simple PICA Path expression.

=head1 SEE ALSO

See L<Catmandu::PICA> for more elaborate command line processing of PICA+ data.

=cut
