package R::Rdtools;
use Carp;
use Exporter;
use FileHandle;
use Text::DelimMatch;
@ISA = qw(Exporter);
@EXPORT = qw(get_section get_usages get_arglist Rdpp);
my $delimcurly = new Text::DelimMatch("\\{", "\\}");
$delimcurly->escape("\\");
my $delimround = new Text::DelimMatch("\\(", "\\)");
sub get_section {
my ($text, $section) = @_;
## remove comments
$text =~ s/([^\\])((\\\\)*)%.*\n/$1$2\n/g;
my @text = split(/\\$section\{/, " " . $text);
shift @text;
my @sections;
foreach $text (@text) {
$delimcurly->match("\{" . $text);
push(@sections, $delimcurly->matched);
}
@sections;
}
sub get_usages {
my ($text, $mode, $verbose) = @_;
## remove comments
$text =~ s/([^\\])((\\\\)*)%.*\n/$1$2\n/g;
##
## This apparently gets quoted args wrong, e.g. in read.table().
## $delimround->quote("\"");
## $delimround->quote("\'");
##
sub add_to_functions {
my($ref, $prefix, $match, $name) = @_;
if($ref->{$prefix}) {
## Multiple usages for a function are trouble.
## We could try to build the full arglist for the function
## from the usages. A simple idea is to do
## chop($ref->{$prefix});
## my $foo = ", " . substr($match, 1);
## $ref->{$prefix} .= $foo;
## which adds the 'new' args to the 'old' ones.
## This is not good enough, as it could give duplicate args
## which is not allowed. In fact, we would generally need
## an R parser for the arglist, as the usages could be as
## bad as
## foo(a = c("b", "c"), b = ...)
## foo(c = NULL, ...)
## so that splitting on ',' is not good enough.
## However, there are really only two functions with
## justified multiple usage (abline and seq), so we simply
## warn about multiple usage in case it was not shadowed by
## a \synopsis unless in mode 'args', where we can cheat.
if(($mode eq "args") || ($mode eq "style")) {
my $save_prefix = $prefix . "~";
while($ref->{$save_prefix}) {
$save_prefix .= "~";
}
$ref->{$save_prefix} = $match;
}
else {
print("Multiple usage for function $prefix in $name\n");
}
} else {
$ref->{$prefix} = $match;
}
}
sub add_to_S4methods {
##
## Could make this smarter similar to add_to_functions().
my($ref, $prefix, $match, $name) = @_;
if($ref->{$prefix}) {
print("Multiple usage for S4 method $prefix in ${name}\n");
}
else {
$ref->{$prefix} = $match;
}
##
}
my %usages;
my %functions = ();
my @variables;
my @data_sets;
my %S4methods;
my $ref_to_functions = \%functions;
my $ref_to_S4methods = \%S4methods;
my @text;
my $name;
my $maybe_is_data_set_doc = 0;
## Get the \name documented.
$name = $delimcurly->match(substr($text, index($text, "\\name")));
$name = substr($name, 1, $#name);
##
## We need to special-case documentation for data sets, or the
## data(FOO) usage for data sets overrides the docs for function
## data(). Unless Rd is extended so that there is an Rd type for
## data docs, we need to rely on heuristics. Older versions ignored
## all Rd files having 'datasets' as their only keyword. But this
## is a problem: Rd authors might use other keywords to indicate
## that a data set is useful for a certain kind of statistical
## analysis. Hence, we do the following: ignore all usages of
## data(FOO) in a file with keyword 'datasets' where FOO as only one
## argument in the sense that it does not match ','.
$maybe_is_data_set_doc = 1
if($text =~ /\\keyword\{\s*datasets\s*\}/);
##
## In 'codoc' mode, use \synopsis in case there is one, but warn
## about doing so if verbose.
@text = split(/\\synopsis/, $text) if($mode eq "codoc");
if($#text > 0) {
print "Using synopsis in '$name'\n" if($verbose);
} else {
@text = split(/\\usage/, $text);
}
shift @text;
foreach $text (@text) {
my $usage = $delimcurly->match($text);
## Get rid of leading and trailing braces.
$usage = substr($usage, 1, -1) if($usage);
## Remove R comment lines. (Removing all comments is a bit
## tricky as '#' could occur inside a string ...)
$usage =~ s/(^|\n)[\s\n]*\#.*(\n|$)/$2/g;
while($usage) {
$usage =~ s/^[\s\n]*//g;
## Try to match usage for variables (i.e., a syntactic name
## on a single line).
if($usage =~
/^([\.[:alpha:]][\.[:alnum:]]*)\s*([\#\n]|$)/) {
push(@variables, $1);
$usage =~ s/^.*(\n|$)//;
next;
}
## Try matching infix markup for '%whatever%' special binary
## operators. Note that R-lang says that 'whatever' can be
## *anything* ...
if($usage =~
/^([\.[:alnum:]]+)\s+\\\%(.+)\\\%\s+([\.[:alnum:]]+)\s*([\#\n]|$)/) {
##
## We should really have a common regexp for matching
## syntactic R names.
##
add_to_functions($ref_to_functions,
"\%" . $2 . "\%",
"($1, $3)",
$name);
$usage =~ s/^.*(\n|$)//;
next;
}
##
## Add something for matching subscripting and subassigning
## (extracting and replacing subsets): look for
## ^${R_syntax_name_RE}[?
## and use delimMatch to get the matching ']' (or ']]').
## Similarly for subscripting/subassiging via '$'.
##
## Note that we also need to deal with markup for S3 methods
## for subscripting and subassigning: unless we know what
## Rdconv() should do with e.g.
## \S3method{[}{data.frame}(ARGLIST)
## there is really not too much point in playing with this.
##
my ($generic, $class, $siglist);
## Try to match the next '(...)' arglist from $usage.
my ($prefix, $match, $rest) = $delimround->match($usage);
## Play with $prefix.
$prefix =~ s/[\s\n]*$//;
$prefix =~ s/^([\s\n\{]*)//;
$prefix =~ s/(.*\n)*//g;
$prefix =~ s/^\"//;
$prefix =~ s/\"$//;
if(!$prefix || ($prefix =~ /.+<-.+/)) {
## Ignore matches where $prefix is empty, or contains a
## '<-' in the 'middle', most likely from something like
## NAME <- FUN(ARGLIST), e.g., Random.Rd.
$rest =~ s/^.*(\n|$)//g;
$usage = $rest;
next;
}
if($prefix =~
/\\(S3)?method\{([[:alnum:].]+)\}\{([[:alnum:].]+)\}/) {
## We need a little magic for S3 replacement *methods*,
## and hence remember S3 methods markup found.
$generic = $2;
$class = $3;
if($mode eq "style") {
$prefix = "\\method{${generic}}{${class}}";
}
else {
$prefix = "${generic}.${class}";
}
}
if($prefix =~
/\\S4method\{([[:alnum:].]+)\}\{([[:alnum:].,]+)\}/) {
## We need a little magic for S4 replacement *methods*,
## and hence remember S4 methods markup found.
$generic = $1;
$siglist = $2;
$prefix = "\\S4method{${generic}}{${siglist}}";
}
## Play with $match.
$match =~ s/=\s*([,\)])/$1/g;
$match =~ s/\<\\"\<\> style
$match =~ s/\>\>/\>\>\"/g; # foo = <> style
$match =~ s/\\%/%/g; # comments
if($rest =~ /^\s*([\#\n]|$)/) {
## Note that we need to allow for R comments in the
## above regexp.
##
## Heuristics for data set documentation once more.
if($maybe_is_data_set_doc
&& ($prefix eq "data")
&& ($match !~ /\,/)) {
push(@data_sets, substr($match, 1, -1));
last;
}
##
if($siglist) {
add_to_S4methods($ref_to_S4methods,
$prefix, $match, $name);
}
else {
add_to_functions($ref_to_functions,
$prefix, $match, $name);
}
}
elsif($rest =~ /^\s*\<-\s*([[:alpha:]]+)\s*(\n|$)/) {
## Looks like documentation for a replacement function.
if($siglist) {
## Documentaion for an S4 replacement method in the
## form
## \S4method{GENERIC}{SIGLIST}(ARGLIST) <- RHS
$prefix = "\\S4method{${generic}<-}{${siglist}}";
add_to_S4methods($ref_to_S4methods,
$prefix,
substr($match, 0, -1) . ", $1)",
$name);
}
else {
if($generic) {
## Documentation for a replacement *method* in
## the form
## \method{GENERIC}{CLASS}(ARGLIST) <- RHS
## with GENERIC *without* the trailing '<-'.
## Rewrite as
## "GENERIC<-.CLASS" (ARGLIST, VALUE)
## We could also deal with
## \method{GENERIC<-}{CLASS}(ARGLIST) <- RHS
## here but Rdconv() would not get this right
## for the time being ...
if($mode eq "style") {
$prefix =
"\\method{${generic}<-}{${class}}";
}
else {
$prefix = "${generic}<-.${class}";
}
} else {
$prefix = "${prefix}<-";
}
add_to_functions($ref_to_functions,
$prefix,
substr($match, 0, -1) . ", $1)",
$name);
}
}
$rest =~ s/^.*(\n|$)//g;
$usage = $rest;
}
}
@{$usages{"functions"}} = %functions;
@{$usages{"variables"}} = @variables;
@{$usages{"data_sets"}} = @data_sets;
@{$usages{"S4methods"}} = %S4methods;
%usages;
}
sub get_arglist {
## Get the list of all documented arguments, i.e., the first
## arguments from the top-level \item{}{}s in section \arguments,
## split on ','.
my ($text) = @_;
my @args = ();
my @keywords = get_section($text, "keyword");
foreach my $keyword (@keywords) {
return ("*internal*") if($keyword =~ /^\{\s*internal\s*\}$/);
}
my @chunks = get_section($text, "arguments");
foreach my $chunk (@chunks) {
my ($prefix, $match);
my $rest = substr($chunk, 1, -1);
while($rest) {
## Try matching top-level \item{}{}.
($prefix, $match, $rest) = $delimcurly->match($rest);
if($prefix =~ /\\item$/) {
## If successful, $match contains the first argument to
## the \item enclosed by the braces.
$match =~ s/\\dots/.../g;
$match =~ s/\n/ /g;
push(@args, split(/\,\s*/, substr($match, 1, -1)));
} else {
break;
}
## Strip off the second argument to \item.
($prefix, $match, $rest) = $delimcurly->match($rest);
}
}
@args;
}
sub Rdpp {
my ($file, $OS) = @_;
my $fh = new FileHandle "< $file" or croak "open($file): $!\n";
my $skip_level;
my @skip_state;
my $skip;
my $text;
$OS = "unix" unless $OS;
while(<$fh>) {
if (/^#ifdef\s+([A-Za-z0-9]+)/o) {
$skip = $1 ne $OS;
$skip_level += $skip;
push(@skip_state, $skip);
next;
}
if (/^#ifndef\s+([A-Za-z0-9]+)/o) {
$skip = $1 eq $OS;
$skip_level += $skip;
push(@skip_state, $skip);
next;
}
if (/^#endif/o) {
$skip_level -= pop(@skip_state);
next;
}
next if $skip_level > 0;
next if /^\s*%/o;
$text .= $_;
}
close($fh);
$text;
}
1;