#!/usr/bin/perl
# ===========================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information (NCBI)
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government do not place any restriction on its use or reproduction.
# We would, however, appreciate having the NCBI and the author cited in
# any work or product based on this material.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# ===========================================================================
#
# File Name: xtract
#
# Author: Jonathan Kans
#
# Version Creation Date: 8/20/12
#
# ==========================================================================
# Entrez Direct - EDirect
# use strict;
use warnings;
# definitions
use constant false => 0;
use constant true => 1;
# xtract version number
$version = "2.00";
# initialize memory hash
my %memory = ();
# initialize synopsis variables
my %synopsis_acc = ();
my $synopsis_max = 0;
# initial level is global
my $initial_level = 7;
# utility subroutines
sub convert_http {
my $str = shift (@_);
$str =~ s/\&\;/\&/g;
$str =~ s/\&apos\;/\'/g;
$str =~ s/\>\;/\>/g;
$str =~ s/\<\;/\ 1 ) {
push (@working, $dec);
} elsif ( $itm eq "INNER" and $indx > 1 and $indx < $sclr ) {
push (@working, $dec);
} elsif ( $itm eq "TAIL" and $indx == $sclr and $sclr > 1 ) {
push (@working, $dec);
# even and odd are based only on index value, not list length
} elsif ( $itm eq "EVEN" and ( $indx % 2) == 0 ) {
push (@working, $dec);
} elsif ( $itm eq "ODD" and ( $indx % 2) == 1 ) {
push (@working, $dec);
}
} elsif ( $itm =~ /^@(.+)/ ) {
# match unqualified @attribute
my $att = $1;
@atts = ($line =~ /<\S+ ([^>]+)>/g);
foreach $val (@atts) {
if ( $val =~ /$att=\"([^\"]+)\"/ ) {
$dec = convert_http($1);
push (@working, $dec);
}
}
} elsif ( $itm =~ /(.+)@(.+)/ ) {
# match qualified tag@attribute
my $tag = $1;
my $att = $2;
@atts = ($line =~ /<$tag ([^>]+)>/g);
foreach $val (@atts) {
if ( $val =~ /$att=\"([^\"]+)\"/ ) {
$dec = convert_http($1);
push (@working, $dec);
}
}
} elsif ( $itm =~ /(.+)\/(.+)/ ) {
# match parent/child by tracking depth level of tokens
my $tag = $1;
my $chd = $2;
@vals = ($line =~ /<$tag(?:\s+.+?)?>(.+?)<\/$tag>/g);
foreach $val (@vals) {
my $lvl = 0;
my @tokens = split (/(?<=>)(?=<)/, $val);
foreach $tkn (@tokens) {
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
# content-containing tag does not change level
if ( $tkn =~ /<$chd(?:\s+.+?)?>(.+?)<\/$chd>/ ) {
# child tag matches, only accept if immediately below parent
if ( $lvl == 0 ) {
$dec = convert_http($1);
push (@working, $dec);
}
}
} elsif ( $tkn =~ /\/>$/ ) {
# self-closing token does not change level
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
$lvl++;
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
$lvl--;
}
}
}
} elsif ( $itm =~ /(.+)\((.*)\)/ ) {
# match self-closing tag (alias to print is in parentheses)
my $tag = $1;
my $rep = $2;
if ( $rep eq "" ) {
$rep = $tag;
}
# [ is it possible to look for both forms in one pass ? ]
@vals = ($line =~ /(<$tag.*?><\/$tag>)/g);
if ( scalar @vals != 0 ) {
# found alternative form ( ) of self-closing tag
foreach $val (@vals) {
$dec = convert_http($rep);
push (@working, $dec);
}
} else {
# look for compact form ( ) if no match to alternative form
if ( scalar @vals == 0 ) {
@vals = ($line =~ /(<$tag.*?\/>)/g);
foreach $val (@vals) {
$dec = convert_http($rep);
push (@working, $dec);
}
}
}
} else {
# match tag with contents
my $tag = $itm;
@vals = ($line =~ /<$tag(?:\s+.+?)?>(.+?)<\/$tag>/g);
foreach $val (@vals) {
$dec = convert_http($val);
push (@working, $dec);
}
}
my $num = scalar @working;
# if -first, -last, etc., remove all but matching subset
if ( $num < 1 ) {
# no matches collected, skip
} elsif ( $cmmd eq "-first" ) {
@working = splice (@working, 0, 1);
$num = scalar @working;
} elsif ( $cmmd eq "-last" ) {
@working = splice (@working, $num - 1, 1);
$num = scalar @working;
} elsif ( $cmmd eq "-single" ) {
if ( $num > 1 ) {
$num = 0;
}
} elsif ( $cmmd eq "-head" ) {
if ( $num < 2 ) {
$num = 0;
} else {
@working = splice (@working, 0, 1);
$num = scalar @working;
}
} elsif ( $cmmd eq "-inner" ) {
if ( $num < 3 ) {
$num = 0;
} else {
@working = splice (@working, 1, $num - 2);
$num = scalar @working;
}
} elsif ( $cmmd eq "-tail" ) {
if ( $num < 2 ) {
$num = 0;
} else {
@working = splice (@working, $num - 1, 1);
$num = scalar @working;
}
}
# store current results into accumulator
if ( $num == 0 ) {
# no printable results
} elsif ( $do_count ) {
push (@accum, $num);
} elsif ( $do_length ) {
my $len = 0;
foreach $dec (@working) {
$len += length ( $dec );
}
push (@accum, $len);
} elsif ( $cmmd eq "-even" ) {
my $take = false;
foreach $dec (@working) {
if ( $take ) {
push (@accum, $dec);
}
$take = (! $take);
}
} elsif ( $cmmd eq "-odd" ) {
my $take = true;
foreach $dec (@working) {
if ( $take ) {
push (@accum, $dec);
}
$take = (! $take);
}
} else {
foreach $dec (@working) {
push (@accum, $dec);
}
}
}
# process results
if ( scalar @accum == 0 ) {
return false;
} elsif ( $variable ne "") {
# store accumulated components into variable
my $str = $pfx;
my $between = "";
foreach $dec (@accum) {
$str .= "$between";
$between = $sep;
$str .= "$dec";
}
$str .= "$sfx";
$memory{"$variable"} = "$str";
} else {
# format accumulated elements
my $before = $prev . $pfx;
my $between = "";
my $after = "";
foreach $dec (@accum) {
print "$before";
$before = "";
print "$between";
$between = $sep;
$after = $sfx;
print "$dec";
}
print "$after";
return true;
}
return false;
}
sub process_flags {
my $line = shift (@_);
my @args = @{shift (@_)};
my $tab = shift (@_);
my $ret = shift (@_);
my $indx = shift (@_);
my $sclr = shift (@_);
my $pfx = "";
my $sfx = "";
my $sep = "\t";
my $col = "\t";
my $lin = "\n";
my $okay = false;
my $cmmd = "";
my $max = scalar @args;
for ( my $i = 0; $i < $max; $i++ ) {
# read command-line arguments
my $val = $args[$i];
# if new argument starts with hyphen
if ( $val =~ /^\-/ ) {
# not an -element value, so set $okay flag to false
$okay = false;
# all flags should be followed by at least one value
if ( $i + 1 >= $max and $val ne "-outline" and $val ne "-synopsis" ) {
# warn that the last flag has no value
print STDERR "No argument after '$val'\n";
# break out of for loop to return from function
last;
# otherwise safe to increment with $i++ and then dereference $args [$i]
}
}
# print outline of XML structure
if ( $val eq "-outline" ) {
my $lvl = 0;
my $dpth = 0;
my @tokens = split (/(?<=>)(?=<)/, $line);
foreach $tkn (@tokens) {
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
# content-containing tag
if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
$tkn = $1;
$lvl++;
$dpth++;
for ( $i = 0; $i < $dpth; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
$dpth--;
}
} elsif ( $tkn =~ /^<.+?\/>$/ ) {
# self-closing token
if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
$tkn = $1;
$lvl++;
$dpth++;
for ( $i = 0; $i < $dpth; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
$dpth--;
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
$lvl++;
$dpth++;
if ( $tkn =~ /^<(\S+).*?>$/ ) {
$tkn = $1;
if ( $tkn ne "?xml" and
$tkn ne "!DOCTYPE" and
$tkn ne "eSummaryResult" and
$tkn ne "eInfoResult" ) {
for ( $i = 0; $i < $dpth; $i++ ) {
print " ";
}
print "$tkn\n";
} else {
$dpth--;
}
}
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
$lvl--;
$dpth--;
}
}
# collect summary of XML paths, print at end
} elsif ( $val eq "-synopsis" ) {
my @arr = ();
my $lvl = 0;
my @tokens = split (/(?<=>)(?=<)/, $line);
foreach $tkn (@tokens) {
my $record_level = 0;
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
# content-containing tag
if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
$tkn = $1;
$lvl++;
$arr[$lvl] = $tkn;
$record_level = $lvl;
$lvl--;
}
} elsif ( $tkn =~ /^<.+?\/>$/ ) {
# self-closing token
if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
$tkn = $1;
$lvl++;
$arr[$lvl] = $tkn;
$record_level = $lvl;
$lvl--;
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
$lvl++;
if ( $tkn =~ /^<(\S+).*?>$/ ) {
$tkn = $1;
$arr[$lvl] = $tkn;
$record_level = $lvl;
}
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
$lvl--;
}
if ( $record_level > 0 ) {
my $str = "";
my $pfx = "";
for ( $i = 1; $i <= $record_level; $i++ ) {
$itm = $arr[$i];
if ( $itm ne "?xml" and
$itm ne "!DOCTYPE" and
$itm ne "eSummaryResult" and
$itm ne "eInfoResult" ) {
$str .= "$pfx";
$pfx = "/";
$str .= "$arr[$i]";
}
}
if ( $str ne "" ) {
my $val = 0;
if ( exists ($synopsis_acc{"$str"}) ) {
$val = $synopsis_acc{"$str"};
}
$val++;
$synopsis_acc{"$str"} = $val;
if ( $val > $synopsis_max ) {
$synopsis_max = $val;
}
}
}
}
# look for arguments that adjust output formatting
} elsif ( $val eq "-pfx" ) {
$i++;
$pfx = convert_slash ( $args[$i] );
} elsif ( $val eq "-sfx" ) {
$i++;
$sfx = convert_slash ( $args[$i] );
} elsif ( $val eq "-sep" ) {
$i++;
$sep = convert_slash ( $args[$i] );
} elsif ( $val eq "-tab" ) {
$i++;
$col = convert_slash ( $args[$i] );
} elsif ( $val eq "-ret" ) {
$i++;
$lin = convert_slash ( $args[$i] );
} elsif ( $val eq "-lbl" ) {
$i++;
print "$tab";
$lbl = convert_slash ( $args[$i] );
print "$lbl";
$tab = $col;
$ret = $lin;
# look for -element or limiting variants, non-hyphenated arguments to follow
} elsif ( $val eq "-element" or
$val eq "-first" or
$val eq "-last" or
$val eq "-even" or
$val eq "-odd" or
$val eq "-single" or
$val eq "-head" or
$val eq "-inner" or
$val eq "-tail" ) {
$okay = true;
$cmmd = $val;
# hyphen followed by capital letters or digits is a variable
} elsif ( $val =~ /^\-([A-Z0-9]+)([a-z]?)$/ ) {
my $variable = $1;
my $first_or_last = $2;
$cmmd = "-element";
# detect f, l, etc., variable suffix for -first, -last, etc. (undocumented)
if ( $first_or_last eq "f" ) {
$cmmd = "-first";
} elsif ( $first_or_last eq "l" ) {
$cmmd = "-last";
} elsif ( $first_or_last eq "e" ) {
$cmmd = "-even";
} elsif ( $first_or_last eq "o" ) {
$cmmd = "-odd";
} elsif ( $first_or_last eq "s" ) {
$cmmd = "-single";
} elsif ( $first_or_last eq "h" ) {
$cmmd = "-head";
} elsif ( $first_or_last eq "i" ) {
$cmmd = "-inner";
} elsif ( $first_or_last eq "t" ) {
$cmmd = "-tail";
}
# clear the variable to avoid old value persisting if no subsequent match
$memory{"$variable"} = "";
$i++;
$val = $args[$i];
# set variable to new value
if ( $val =~ /^\((.*)\)$/ ) {
# parentheses contain literal value for variable
$val = convert_slash ( $1 );
$memory{"$variable"} = "$val";
} else {
# if XML tag name, record what would otherwise be printed by -element
process_element ( $line, $val, $tab, $pfx, $sfx, $sep, $cmmd, $variable, $indx, $sclr );
}
# reality check on unsupported arguments
} elsif ( $val =~ /^\-/ ) {
print STDERR "Unrecognized argument '$val'\n";
} elsif ( $okay ) {
# process -element values
if ( process_element ( $line, $val, $tab, $pfx, $sfx, $sep, $cmmd, "", $indx, $sclr )) {
$tab = $col;
$ret = $lin;
}
} else {
print STDERR "No -element before '$val'\n";
}
}
return $tab, $ret;
}
sub has_content {
my $line = shift (@_);
my $str = shift (@_);
my $indx = shift (@_);
my $sclr = shift (@_);
if ( $str eq "" or $str eq "&" ) {
return false;
}
# match by non-empty variable value (undocumented)
if ( $str =~ /^&([A-Z0-9]+)$/ ) {
$str = $1;
if ( exists ($memory{"$str"}) and $memory{"$str"} ne "" ) {
return true;
}
return false;
}
# remove leading backslash used to prevent content from being mistaken for variable
$str =~ s/^\\&/&/;
# match by exploration flag (undocumented)
if ( $str =~ /^\?([A-Z]+)/ ) {
$str = $1;
# first and last elements are always present, and are the same for a one-element list
if ( $str eq "FIRST" and $indx == 1 ) {
return true;
} elsif ( $str eq "LAST" and $indx == $sclr ) {
return true;
# single, head, inner, tail are mutually exclusive situations
} elsif ( $str eq "SINGLE" and $sclr == 1 ) {
return true;
} elsif ( $str eq "HEAD" and $indx == 1 and $sclr > 1 ) {
return true;
} elsif ( $str eq "INNER" and $indx > 1 and $indx < $sclr ) {
return true;
} elsif ( $str eq "TAIL" and $indx == $sclr and $sclr > 1 ) {
return true;
# even and odd are based only on index value, not list length
} elsif ( $str eq "EVEN" and ( $indx % 2) == 0 ) {
return true;
} elsif ( $str eq "ODD" and ( $indx % 2) == 1 ) {
return true;
}
}
# remove leading backslash used to prevent content from being mistaken for flag
$str =~ s/^\\\?/?/;
# exact match by data contents, not using regular expression, case insensitive
if ( index ( uc($line), uc($str) ) >= 0 ) {
return true;
}
return false;
}
sub has_element {
my $line = shift (@_);
my $str = shift (@_);
if ( $str eq "" ) {
return false;
}
# match by specific variable value (undocumented)
if ( $str =~ /^\&([A-Z0-9]+)\:(.+)$/ ) {
my $ptrn = $1;
my $chld = $2;
if ( exists ($memory{"$ptrn"}) and $memory{"$ptrn"} eq "$chld" ) {
return true;
}
return false;
}
# match by non-empty variable value (undocumented)
if ( $str =~ /^\&([A-Z0-9]+)$/ ) {
$str = $1;
if ( exists ($memory{"$str"}) and $memory{"$str"} ne "" ) {
return true;
}
return false;
}
# regular expression search for @attribute:value
if ( $str =~ /@(.+)\:(.+)/ ) {
my $attr = $1;
my $chld = $2;
if ( $line =~ /<\S+ ([^>]+)>/i ) {
my $atts = $1;
if ( $atts =~ /$attr=\"$chld\"/ ) {
return true;
}
}
return false;
}
# regular expression search for element@attribute:value
if ( $str =~ /(.+)@(.+)\:(.+)/ ) {
my $ptrn = $1;
my $attr = $2;
my $chld = $3;
if ( $line =~ /<$ptrn ([^>]+)>/i ) {
my $atts = $1;
if ( $atts =~ /$attr=\"$chld\"/ ) {
return true;
}
}
return false;
}
# regular expression search for element@attribute
if ( $str =~ /(.+)@(.+)/ ) {
my $ptrn = $1;
my $attr = $2;
if ( $line =~ /<$ptrn ([^>]+)>/i ) {
my $atts = $1;
if ( $atts =~ /$attr=\"([^\"]+)\"/ ) {
return true;
}
}
return false;
}
# INSDSeq special cases - &Feature:name and &Qualifier:name
if ( $str =~ /^\&Feature\:(.+)/ ) {
$str = "INSDFeature_key:" . "$1";
} elsif ( $str =~ /^\&Qualifier\:(.+)/ ) {
$str = "INSDQualifier_name:" . "$1";
}
# regular expression search for element:value
if ( $str =~ /(.+)\:(.+)/ ) {
my $ptrn = $1;
my $chld = $2;
if ( $line =~ /<$ptrn(?:\s+.+?)?>$chld<\/$ptrn>/i ) {
return true;
}
return false;
}
# regular expression search for element tag
if ( $line =~ /<$str(?:\s+.+?)?>/i ) {
return true;
}
# also test self-closing tag
if ( $line =~ /<$str(?:\s+.+?)?\/>/i ) {
return true;
}
return false;
}
sub process_level {
my $line = shift (@_);
my $par = shift (@_);
my $chd = shift (@_);
my @args = @{shift (@_)};
my $tab = shift (@_);
my $ret = shift (@_);
my $level = shift (@_);
my $indx = shift (@_);
my $sclr = shift (@_);
# handle heterogeneous child blocks if pattern was parent/*
if ( $chd eq "*" and $par ne "" and $par ne "*" ) {
my @vals = ();
my $lvl = 0;
my $in_pat = false;
my $pfx = "";
my @working = ();
if ( $line =~ /^<$par(?:\s+.+?)?>(.+?)<\/$par>$/ ) {
$line = $1;
}
my @tokens = split (/(?<=>)(?=<)/, $line);
foreach $tkn (@tokens) {
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
# content-containing tag does not change level
if ( $lvl > 0 ) {
push (@working, $tkn);
} else {
# match to full object, push contents directly to final array
push (@vals, $tkn);
}
} elsif ( $tkn =~ /\/>$/ ) {
# self-closing token does not change level
if ( $lvl > 0 ) {
push (@working, $tkn);
} else {
push (@vals, $tkn);
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
push (@working, $tkn);
$lvl++;
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
push (@working, $tkn);
$lvl--;
if ( $lvl == 0 ) {
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
}
}
if ( scalar @working > 0 ) {
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
foreach $val (@vals) {
if ( $level == $initial_level ) {
print "$pfx";
%memory = ();
}
( $tab, $ret ) = process_level ( $val, "", "", \@args, $tab, $ret, $level, $indx, $sclr );
if ( $level == $initial_level ) {
$pfx = $ret;
$tab = "";
}
}
return $tab, $ret;
}
# track depth level of tokens if pattern was parent/child
if ( $chd ne "" ) {
my @vals = ();
my $lvl = 0;
my $dpth = 0;
my $in_pat = false;
my $pfx = "";
my @working = ();
my @tokens = split (/(?<=>)(?=<)/, $line);
foreach $tkn (@tokens) {
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
# content-containing tag does not change level
if ( $in_pat ) {
push (@working, $tkn);
} elsif ( $tkn =~ /<$chd(?:\s+.+?)?>(.+?)<\/$chd>/ and $lvl < 2 ) {
# match to full object, push directly to final array
push (@vals, $tkn);
}
} elsif ( $tkn =~ /\/>$/ ) {
# self-closing token does not change level
if ( $in_pat ) {
push (@working, $tkn);
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
if ( $in_pat ) {
push (@working, $tkn);
} elsif ( $tkn =~ /<$chd(?:\s+.+?)?>/ and $lvl < 2 ) {
$in_pat = true;
$dpth = $lvl;
push (@working, $tkn);
}
$lvl++;
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
$lvl--;
if ( $in_pat ) {
push (@working, $tkn);
}
if ( $tkn =~ /<\/$chd>/ ) {
if ( $lvl <= $dpth ) {
$in_pat = false;
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
}
}
}
if ( scalar @working > 0 ) {
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
foreach $val (@vals) {
if ( $level == $initial_level ) {
print "$pfx";
%memory = ();
}
( $tab, $ret ) = process_level ( $val, "", "", \@args, $tab, $ret, $level, $indx, $sclr );
if ( $level == $initial_level ) {
$pfx = $ret;
$tab = "";
}
}
return $tab, $ret;
}
my $max = scalar @args;
# allow conditional arguments in any order
my $go_on = true;
while ( $go_on ) {
if ( $max > 2 and $args[0] eq "-position" ) {
# -position command filters by object position in list
my $pstn = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
# first and last elements are always present, and are the same for a one-element list
if ( $pstn eq "first" and $indx > 1 ) {
return $tab, $ret;
} elsif ( $pstn eq "last" and $indx < $sclr ) {
return $tab, $ret;
# single, head, inner, tail are mutually exclusive situations
} elsif ( $pstn eq "single" and $sclr > 1 ) {
return $tab, $ret;
} elsif ( $pstn eq "head" and ( $indx > 1 or $sclr == 1 ) ) {
return $tab, $ret;
} elsif ( $pstn eq "inner" and ( $indx == 1 or $indx == $sclr ) ) {
return $tab, $ret;
} elsif ( $pstn eq "tail" and ( $indx < $sclr or $sclr == 1 ) ) {
return $tab, $ret;
# single and multiple are mutually exclusive situations
} elsif ( $pstn eq "multiple" and $sclr == 1 ) {
return $tab, $ret;
# even and odd are based only on index value, not list length
} elsif ( $pstn eq "even" and ( $indx % 2) == 1 ) {
return $tab, $ret;
} elsif ( $pstn eq "odd" and ( $indx % 2) == 0 ) {
return $tab, $ret;
# value is actual numeric index
} elsif ( $pstn =~ /^[0-9]+$/ and $indx != $pstn ) {
return $tab, $ret;
}
} elsif ( $max > 2 and $args[0] eq "-match" ) {
# -match ... -and/-or ... command filters for indicated element tag [:value]
my $required = 1;
my $observed = 0;
my $prevbool = "";
my $mtch = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_element ( $line, $mtch )) {
$observed++;
}
# "or" succeeds on any match, "and" requires all tests to match
while ( $max > 2 and ( $args[0] eq "-or" or $args[0] eq "-and" )) {
if ( $prevbool ne "" and $prevbool ne $args[0] ) {
print STDERR "A mixture of -and and -or commands cannot follow -match\n";
return $tab, $ret;
}
$prevbool = $args[0];
if ( $prevbool eq "-and" ) {
$required++;
}
$mtch = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_element ( $line, $mtch )) {
$observed++;
}
}
if ( $observed < $required ) {
return $tab, $ret;
}
} elsif ( $max > 2 and $args[0] eq "-avoid" ) {
# -avoid ... -and ... command filters against indicated element tag [:value]
my $skip = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_element ( $line, $skip )) {
return $tab, $ret;
}
# colloquial "and" is really logical "or" - any match bails out of function
while ( $max > 2 and $args[0] eq "-and" ) {
$skip = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_element ( $line, $skip )) {
return $tab, $ret;
}
}
if ( $max > 1 and $args[0] eq "-or" ) {
print STDERR "The -or command cannot follow -avoid\n";
return $tab, $ret;
}
} elsif ( $max > 2 and $args[0] eq "-present" ) {
# -present ... -and/-or ... command filters for indicated data content
my $required = 1;
my $observed = 0;
my $prevbool = "";
my $mtch = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_content ( $line, $mtch, $indx, $sclr )) {
$observed++;
}
# "or" succeeds on any match, "and" requires all tests to match
while ( $max > 2 and ( $args[0] eq "-or" or $args[0] eq "-and" )) {
if ( $prevbool ne "" and $prevbool ne $args[0] ) {
print STDERR "A mixture of -and and -or commands cannot follow -present\n";
return $tab, $ret;
}
$prevbool = $args[0];
if ( $prevbool eq "-and" ) {
$required++;
}
$mtch = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_content ( $line, $mtch, $indx, $sclr )) {
$observed++;
}
}
if ( $observed < $required ) {
return $tab, $ret;
}
} elsif ( $max > 2 and $args[0] eq "-absent" ) {
# -absent ... -and ... command filters against indicated data content
my $skip = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_content ( $line, $skip, $indx, $sclr )) {
return $tab, $ret;
}
# colloquial "and" is really logical "or" - any match bails out of function
while ( $max > 2 and $args[0] eq "-and" ) {
$skip = $args[1];
$max -= 2;
@args = splice (@args, 2, $max);
if ( has_content ( $line, $skip, $indx, $sclr )) {
return $tab, $ret;
}
}
if ( $max > 1 and $args[0] eq "-or" ) {
print STDERR "The -or command cannot follow -absent\n";
return $tab, $ret;
}
} elsif ( $max > 1 and $args[0] eq "-trim" ) {
# -trim removes first XML tag
while ( $max > 1 and $args[0] eq "-trim" ) {
$max--;
@args = splice (@args, 1, $max);
if ( $line =~ /^<.+?>(.+)$/ ) {
$line = $1;
}
}
} else {
# no other conditional tests or trim commands, break loop
$go_on = false;
}
}
# at level 0 break recursion for nested organizers, process -element
if ( $level < 1 ) {
( $tab, $ret ) = process_flags ( $line, \@args, $tab, $ret, $indx, $sclr );
return $tab, $ret;
}
# get name of command for current organizer level
# initial capital signifies tokenized search matching nesting levels
# (tokenizing is significantly slower than regular expression pattern matching)
my $name = "";
my $capname = "";
if ( $level == 7 ) {
$name = "-division";
$capname = "-Division";
} elsif ( $level == 6) {
$name = "-group";
$capname = "-Group";
} elsif ( $level == 5) {
$name = "-branch";
$capname = "-Branch";
} elsif ( $level == 4) {
$name = "-block";
$capname = "-Block";
} elsif ( $level == 3) {
$name = "-section";
$capname = "-Section";
} elsif ( $level == 2) {
$name = "-subset";
$capname = "-Subset";
} elsif ( $level == 1) {
$name = "-unit";
$capname = "-Unit";
}
# find chains of arguments between indicated command names
my $start = 0;
my $stop = 0;
for ( $start = 0; $start < $max; $start = $stop ) {
$stop = $start + 1;
while ( $stop < $max and $args[$stop] ne $name and $args[$stop] ne $capname ) {
$stop++;
}
# collect arguments between -group, -block, or -subset directives
my @tmp = @args;
if ( $tmp[$start] eq $name and $stop - $start >= 2 ) {
my $pat = $tmp[$start + 1];
my $chd = "";
# INSDSeq special cases - &Feature and &Qualifier
if ( $pat =~ /^\&Feature$/ ) {
$pat = "INSDFeature";
} elsif ( $pat =~ /^\&Qualifier$/ ) {
$pat = "INSDQualifier";
}
if ( $pat =~ /(.+)\/(.+)/ ) {
$pat = $1;
$chd = $2;
}
$start += 2;
my @group = splice (@tmp, $start, $stop - $start);
# normal non-greedy pattern matching for non-nested XML
my @vals = ($line =~ /(<$pat(?:\s+.+?)?>.+?<\/$pat>)/g);
if ( scalar @vals > 0 ) {
my $curr = 0;
my $totl = scalar @vals;
foreach $val (@vals) {
$curr++;
( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
}
} else {
@vals = ($line =~ /(<$pat(?:\s+.+?)\/>)/g);
my $curr = 0;
my $totl = scalar @vals;
foreach $val (@vals) {
$curr++;
( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
}
}
} elsif ( $tmp[$start] eq $capname and $stop - $start >= 2 ) {
my $pat = $tmp[$start + 1];
my $chd = "";
if ( $pat =~ /(.+)\/(.+)/ ) {
$pat = $1;
$chd = $2;
}
$start += 2;
my @group = splice (@tmp, $start, $stop - $start);
# tokenized pattern matching for XML with nested tags
my @vals = ();
my $lvl = 0;
my $dpth = 0;
my $in_pat = false;
my @working = ();
my @tokens = split (/(?<=>)(?=<)/, $line);
foreach $tkn (@tokens) {
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
# content-containing tag does not change level
if ( $in_pat ) {
push (@working, $tkn);
} elsif ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
# match to full object, push directly to final array
push (@vals, $tkn);
}
} elsif ( $tkn =~ /\/>$/ ) {
# self-closing token does not change level
if ( $in_pat ) {
push (@working, $tkn);
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
if ( $in_pat ) {
push (@working, $tkn);
} elsif ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
$in_pat = true;
$dpth = $lvl;
push (@working, $tkn);
}
$lvl++;
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
$lvl--;
if ( $in_pat ) {
push (@working, $tkn);
}
if ( $tkn =~ /<\/$pat>/ ) {
if ( $lvl <= $dpth ) {
$in_pat = false;
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
}
}
}
if ( scalar @working > 0 ) {
my $str = join ("", @working);
push (@vals, $str);
@working = ();
}
my $curr = 0;
my $totl = scalar @vals;
foreach $val (@vals) {
$curr++;
( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
}
} else {
my @group = splice (@tmp, $start, $stop - $start);
( $tab, $ret ) = process_level ( $line, "", "", \@group, $tab, $ret, $level - 1, $indx, $sclr );
}
}
return $tab, $ret;
}
sub process_insd {
# ... | xtract -insd [+/-] mat_peptide "%peptide" product peptide
my $quot = shift (@_);
my @args = @{shift (@_)};
my $max = scalar @args;
my @working = ();
# print common arguments
push (@working, "-pattern");
push (@working, "INSDSeq");
push (@working, "-ACCN");
push (@working, "INSDSeq_accession-version");
# collect descriptors
if ( $args[1] =~ /INSD/ ) {
push (@working, "-pfx");
push (@working, "$quot\\n$quot");
push (@working, "-element");
push (@working, "$quot\&ACCN$quot");
for ( my $i = 1; $i < $max; $i++ ) {
my $val = $args[$i];
push (@working, "-block");
push (@working, "INSDSeq");
push (@working, "-element");
push (@working, "$quot$val$quot");
}
return @working;
}
# collect qualifiers
my @qualifiers = ();
my @required = ();
my $feat = "";
my $partial = 0;
for ( my $i = 1; $i < $max; $i++ ) {
my $val = $args[$i];
if ( $feat eq "" ) {
if ( $val eq "+" or $val eq "complete" ) {
$partial = 1;
} elsif ( $val eq "-" or $val eq "partial" ) {
$partial = -1;
} else {
$feat = $val;
}
} else {
if ( $val =~ /^\+(.+)/ ) {
$val = $1;
push (@required, $val);
}
push (@qualifiers, $val);
}
}
push (@working, "-group");
push (@working, "INSDFeature");
my @itms = split (',', $feat);
my $fcmd = "-match";
foreach $itm (@itms) {
push (@working, "$fcmd");
push (@working, "INSDFeature_key:$itm");
$fcmd = "-or";
}
if ( $partial == 1 ) {
push (@working, "-avoid");
push (@working, "INSDFeature_partial5");
push (@working, "-and");
push (@working, "INSDFeature_partial3");
} elsif ( $partial == -1 ) {
push (@working, "-match");
push (@working, "INSDFeature_partial5");
push (@working, "-or");
push (@working, "INSDFeature_partial3");
}
my $rcmd = "-match";
foreach my $val (@required) {
push (@working, "$rcmd");
push (@working, "INSDQualifier_name:$val");
$rcmd = "-and";
}
push (@working, "-pfx");
push (@working, "$quot\\n$quot");
push (@working, "-element");
push (@working, "$quot&ACCN$quot");
foreach my $val (@qualifiers) {
if ( $val =~ /INSD/ ) {
push (@working, "-block");
push (@working, "INSDFeature");
push (@working, "-element");
push (@working, "$quot$val$quot");
} else {
push (@working, "-block");
push (@working, "INSDQualifier");
if ( $val =~ /^%(.+)/ ) {
$val = $1;
push (@working, "-match");
push (@working, "INSDQualifier_name:$val");
push (@working, "-element");
push (@working, "$quot\%INSDQualifier_value$quot");
} else {
push (@working, "-match");
push (@working, "INSDQualifier_name:$val");
push (@working, "-element");
push (@working, "INSDQualifier_value");
}
}
}
return @working;
}
sub process_make {
# xtract -make pubmed Author "LastName" " " "LastName,Initials" Alpher Bethe Gamow
my @args = @{shift (@_)};
my $max = scalar @args;
if ( $max < 6 ) {
print STDERR "Must supply db, block, match, sep, element, and at least one data value\n";
return;
}
my $db = $args[1];
my $obj = $args[2];
my $mat = $args[3];
my $sep = $args[4];
my $elm = $args[5];
my $pat = "";
my $uid = "";
if ( $db eq "nucleotide" ) {
$db = "nuccore";
}
if ( $db eq "pubmed" ) {
$pat = "PubmedArticle";
$uid = "MedlineCitation/PMID";
} elsif ( $db eq "nuccore" or $db eq "protein" ) {
$pat = "INSDSeq";
$uid = "INSDSeq_accession-version";
} else {
print STDERR "Currently only generates instructions for pubmed, nuccore, and protein\n";
return;
}
print "xtract -pattern $pat -element \"$uid\" \\\n";
for ( my $i = 6; $i < $max; $i++ ) {
# read items
my $itm = $args[$i];
print " -block $obj";
print " -match \"$mat:$itm\"";
print " -sep \"$sep\" -element \"$elm\"";
if ( $i == $max - 1 ) {
print " | \\\n";
print "sort -t \$'\\t' -k 2f -k 1,1n | \\\n";
print "xtract -fuse $db\n";
} else {
print " \\\n";
}
}
}
sub process_fuse {
# xtract -fuse pubmed
my @args = @{shift (@_)};
my $max = scalar @args;
if ( $max < 2 ) {
die "Need argument for database\n";
}
my $db = $args[1];
my $thisline = "";
my $laststr = "";
my $str = "";
my $uid = "";
my $uidlist = "";
my $count = 0;
my $base = "http://www.ncbi.nlm.nih.gov";
my $pfx = "";
while ($thisline = ) {
$thisline =~ s/\r//;
$thisline =~ s/\n//;
if ($thisline =~ /^([^\t]+)\t(.+)$/) {
$uid = $1;
$str = $2;
if ( lc ($str) ne lc ($laststr) and $laststr ne "" ) {
$laststr =~ s/\t/, /g;
print "( $count ) - $laststr
\n";
$pfx = "";
$count = 0;
$uidlist = "";
}
$laststr = $str;
$uidlist .= "$pfx$uid";
$pfx = ",";
$count++;
}
}
if ( $laststr ne "" ) {
$laststr =~ s/\t/, /g;
print "( $count ) - $laststr
\n";
}
}
my $xtract_help = qq{
Exploration Argument Hierarchy
-pattern (Highest Rank)
-division
-group
-branch
-block
-section
-subset
-unit (Lowest Rank)
Conditional Execution
-match Element [:value] required
-avoid Skips if element matches
-present Requires indicated data pattern
-absent Pattern must not be present
-and All tests must pass
-or Any passing test suffices
-position Must be at given location in list
Format Customization
-ret Overrides line break between patterns
-tab Replaces tab character between fields
-sep Separator between group members
-pfx Prefix to print before group
-sfx Suffix to print after group
Item Selection
-element Print all items that match tag name
-first Only prints value of first item
-last Only prints value of last item
Miscellaneous Arguments
-outline Display outline of XML structure
-synopsis Display count of unique XML paths
-format Repair XML format and indentation
-insd Generate INSDSeq extraction commands
-lbl Inserts arbitrary text
-element Construct Examples
Tag Caption
Group Initials,LastName
Parent/Child MedlineCitation/PMID
Attribute DescriptorName\@MajorTopicYN
Object Count "#Author"
Item Length "%Title"
Variable "&ACCN"
};
sub xtract {
# ... | xtract -pattern ... -group ... -block ... -subset ... -element ...
my $compress_spaces = true;
my $max = scalar @ARGV;
if ( $max > 0 and $ARGV[0] eq "-help" ) {
print "xtract $version\n";
print $xtract_help;
return;
}
if ( $max > 0 and $ARGV[0] eq "-version" ) {
print "$version\n";
return;
}
if ( $max > 0 and $ARGV[0] eq "-make" ) {
process_make ( \@ARGV );
return;
}
if ( $max > 0 and $ARGV[0] eq "-fuse" ) {
process_fuse ( \@ARGV );
return;
}
# -nocompress allows reading of BLAST XML match data with internal runs of spaces (undocumented)
if ( $max > 0 and $ARGV[0] eq "-nocompress" ) {
@ARGV = splice (@ARGV, 1, $max - 1);
$max = scalar @ARGV;
$compress_spaces = false;
}
# -insd to simplify extraction of INSDSeq qualifiers
if ( $max > 0 and $ARGV[0] eq "-insd" ) {
if ( $max < 3 and $ARGV[1] !~ /INSD/) {
print STDERR "Must supply a feature key and at least one qualifier name\n";
return;
} elsif ( -t STDIN ) {
# -insd without piped input will generate an extraction script
@ARGV = process_insd ( "\"", \@ARGV );
print "xtract ";
foreach $itm (@ARGV) {
print "$itm ";
}
print "| \\\n";
return;
} else {
# -insd with piped input will dynamically execute the extraction
@ARGV = process_insd ( "", \@ARGV );
$max = scalar @ARGV;
}
}
# recommended nested organizer levels are -group, -block, and -subset
# intermediate levels (-division, -branch, -section, and -unit) kept in reserve
# initial capitalized versions (-Group, -Block, etc.) use tokenized pattern matching
if ( $max > 1 and $ARGV[0] eq "-split" ) {
# -split reads data stream and writes chunks on separate lines
my $pat = $ARGV[1];
my $in_pat = false;
my @working = ();
while ( defined($thisline = ) ) {
$thisline =~ s/\r//g;
$thisline =~ s/\n//g;
$thisline =~ s/\t//g;
if ( $compress_spaces ) {
$thisline =~ s/ +/ /g;
}
$thisline =~ s/> +>)(?=<)/, $thisline);
foreach $tkn (@tokens) {
if ( $tkn =~ /^ +(.+)$/ ) {
$tkn = $1;
}
if ( $tkn =~ /^(.+) +$/ ) {
$tkn = $1;
}
if ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
if ( scalar @working > 0 ) {
my $str = join ("", @working);
print "$str\n";
@working = ();
}
$in_pat = false;
print "$tkn\n";
} elsif ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
push (@working, $tkn);
$in_pat = true;
} elsif ( $tkn =~ /<\/$pat>/ ) {
push (@working, $tkn);
$in_pat = false;
my $str = join ("", @working);
print "$str\n";
@working = ();
} elsif ( $in_pat) {
push (@working, $tkn);
}
}
}
if ( scalar @working > 0 ) {
my $str = join ("", @working);
print "$str\n";
}
return;
}
if ( $max > 0 and $ARGV[0] eq "-format" ) {
my $lvl = 0;
my $xml_ok = true;
my $doctype_ok = true;
my $root_token = "";
my $print_head = false;
my $print_tail = false;
my $xml = "";
my $doctype = "";
my $pfx = "";
my $sfx = "";
my $go_on = true;
# support -pfx, -sfx, -xml, and -doctype arguments (undocumented)
for ( my $i = 1; $i < $max; $i++ ) {
if ( $ARGV[$i] eq "-pfx" ) {
$i++;
$pfx = convert_slash ( $ARGV[$i] );
} elsif ( $ARGV[$i] eq "-sfx" ) {
$i++;
$sfx = convert_slash ( $ARGV[$i] );
} elsif ( $ARGV[$i] eq "-xml" ) {
$i++;
$xml = convert_slash ( $ARGV[$i] );
} elsif ( $ARGV[$i] eq "-doctype" ) {
$i++;
$doctype = convert_slash ( $ARGV[$i] );
}
}
while ( $go_on ) {
$thisline = "";
if ( $xml ne "" ) {
$thisline = $xml;
$xml = "";
} elsif ( $doctype ne "" ) {
$thisline = $doctype;
$doctype = "";
} elsif ( $pfx ne "" ) {
$thisline = $pfx;
$pfx = "";
} elsif ( defined($thisline = ) ) {
} elsif ( $sfx ne "" ) {
$thisline = $sfx;
$sfx = "";
} else {
$go_on = false;
}
if ( $go_on and $thisline ne "" ) {
$thisline =~ s/\r//g;
$thisline =~ s/\n//g;
$thisline =~ s/\t//g;
if ( $compress_spaces ) {
$thisline =~ s/ +/ /g;
}
$thisline =~ s/> +>)(?=<)/, $thisline);
foreach $tkn (@tokens) {
if ( $tkn =~ /^ +(.+)$/ ) {
$tkn = $1;
}
if ( $tkn =~ /^(.+) +$/ ) {
$tkn = $1;
}
if ( $lvl < 0 ) {
# ignore remainder if level drops below zero
} elsif ( $tkn =~ /^<\?xml/ ) {
if ( $xml_ok ) {
# only print first xml line
print "$tkn\n";
$xml_ok = false;
}
} elsif ( $tkn =~ /^$/ ) {
# processing instruction
$lvl++;
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
} elsif ( $tkn =~ /^<\!--.+-->$/ ) {
# comment
$lvl++;
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
} elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
# content-containing tag
if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
$lvl++;
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
}
} elsif ( $tkn =~ /^<.+?\/>$/ ) {
# self-closing token
if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
$lvl++;
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
$lvl--;
}
} elsif ( $tkn =~ /^<[^\/]/ ) {
# open tag increments level
$lvl++;
if ( $tkn eq "<$root_token>" ) {
if ( $print_head ) {
print "<$root_token>\n";
$print_head = false;
$print_tail = true;
}
} else {
if ( $tkn =~ /^<(\S+).*?>$/ ) {
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
}
}
} elsif ( $tkn =~ /^<\// ) {
# close tag decrements level
if ( $tkn ne "$root_token>" ) {
for ( $i = 1; $i < $lvl; $i++ ) {
print " ";
}
print "$tkn\n";
}
$lvl--;
}
}
}
}
if ( $print_tail ) {
print "$root_token>\n";
}
return;
}
my $ret = "";
my $tab = "";
if ( $max > 2 and $ARGV[0] eq "-pipe" ) {
# -pipe reads data that has already been compressed by xtract -split
@ARGV = splice (@ARGV, 1, $max - 1);
while ( defined($thisline = ) ) {
$thisline =~ s/\r//g;
$thisline =~ s/\n//g;
$thisline =~ s/\t//g;
if ( $compress_spaces ) {
$thisline =~ s/ +/ /g;
}
$thisline =~ s/> +> 2 and $ARGV[0] eq "-pattern" ) {
# simple -pattern reads data stream and processes one pattern at a time
my $pat = $ARGV[1];
my $chd = "";
if ( $pat =~ /(.+)\/(.+)/ ) {
$pat = $1;
$chd = $2;
}
@ARGV = splice (@ARGV, 2, $max - 2);
my $in_pat = false;
my @working = ();
while ( defined($thisline = ) ) {
$thisline =~ s/\r//g;
$thisline =~ s/\n//g;
$thisline =~ s/\t//g;
if ( $compress_spaces ) {
$thisline =~ s/ +/ /g;
}
$thisline =~ s/> +>)(?=<)/, $thisline);
foreach $tkn (@tokens) {
if ( $tkn =~ /^ +(.+)$/ ) {
$tkn = $1;
}
if ( $tkn =~ /^(.+) +$/ ) {
$tkn = $1;
}
if ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
if ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
if ( scalar @working > 0 ) {
my $str = join ("", @working);
@working = ();
my @tmp = @ARGV;
%memory = ();
( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
print "$ret";
}
$in_pat = false;
my @tmp = @ARGV;
%memory = ();
( $tab, $ret ) = process_level ( $tkn, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
print "$ret";
} else {
push (@working, $tkn);
$in_pat = true;
}
} elsif ( $tkn =~ /<\/$pat>/ ) {
push (@working, $tkn);
$in_pat = false;
my $str = join ("", @working);
@working = ();
my @tmp = @ARGV;
%memory = ();
( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
print "$ret";
} elsif ( $in_pat) {
push (@working, $tkn);
}
}
}
if ( scalar @working > 0 ) {
my $str = join ("", @working);
@working = ();
my @tmp = @ARGV;
%memory = ();
( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
print "$ret";
}
return;
}
my $rpt = 1;
# read entire XML input stream into a single string
$holdTerminator = $/;
undef $/;
$data = ;
$/ = $holdTerminator;
# remove newlines, tabs, space between tokens, compress runs of spaces,
$data =~ s/\r//g;
$data =~ s/\n//g;
$data =~ s/\t//g;
if ( $compress_spaces ) {
$data =~ s/ +/ /g;
}
$data =~ s/> +> 2 and $ARGV[0] eq "-repeat" ) {
# first look for -repeat count for performance testing
$rpt = $ARGV[1];
@ARGV = splice (@ARGV, 2, $max - 2);
$max = scalar @ARGV;
}
if ( $ARGV[0] ne "-pattern" ) {
# no -pattern, process entire XML without looking for pattern
my @args = @ARGV;
%memory = ();
( $tab, $ret ) = process_level ( $data, "", "", \@args, "", "", $initial_level, 1, 1 );
print "$ret";
return;
}
# split by -pattern - can handle single term or parent/child (including heterogeneous parent/*)
my $ptrn = $ARGV[1];
my $chld = "";
if ( $ptrn =~ /(.+)\/(.+)/ ) {
$ptrn = $1;
$chld = $2;
}
@vals = ($data =~ /(<$ptrn(?:\s+.+?)?>.+?<\/$ptrn>)/g);
if ( $max == 2 ) {
foreach $val (@vals) {
print "$val\n";
}
return;
}
# process remaining arguments
my @args = splice (@ARGV, 2, $max - 2);
my $curr = 0;
my $totl = scalar @vals;
for ( my $idx = 0; $idx < $rpt; $idx++ ) {
foreach $val (@vals) {
%memory = ();
( $tab, $ret ) = process_level ( $val, $ptrn, $chld, \@args, "", "", $initial_level, $curr, $totl );
print "$ret";
}
}
}
# execute XML extraction
xtract ();
# if -synopsis, print cumulative summary of XML paths
if ( scalar (keys %synopsis_acc) > 0 ) {
my $spaces = 1;
if ( $synopsis_max > 9999 ) {
$spaces = 5;
} elsif ( $synopsis_max > 999 ) {
$spaces = 4;
} elsif ( $synopsis_max > 99 ) {
$spaces = 3;
} elsif ( $synopsis_max > 9 ) {
$spaces = 2;
}
my @keys;
@keys = keys %synopsis_acc;
for my $ky (sort @keys) {
$vl = $synopsis_acc{$ky};
$remaining = $spaces - length ( $vl );
print " $vl";
for ( $i = 0; $i < $remaining; $i++ ) {
print " ";
}
print " $ky\n";
}
}
# close input and output files
close (STDIN);
close (STDOUT);
close (STDERR);