#!/usr/bin/perl

# This script queries a metacat database to locate EML documents, and for each 
# document determines if there are references to data objects.  If so, those
# references are parsed and a list is generated with metadata about each object,
# such as whether the object was directly accessible or not, how many records 
# were present, and the size of the data files.

use Metacat;
use XML::DOM;
use LWP::UserAgent;
use Cache::FileCache;
use strict;

############################################################################
#
# MAIN program block
#
############################################################################

# check that the correct number or parameters are passed from the commandline
if (($#ARGV +1) != 1) {die "Usage: %./cache_eml_data.pl <metacat_url> \n\n";}
# Get the URL to the metacat server from the command line options
my ($url) = @ARGV; 

# Initialize the data cache
my $cacheDir = "/var/metacat/cache";
my $cache = initializeCache($cacheDir);

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# Get a list of EML documents
#my $queryTerm = "%Jones%";
my $queryTerm = "%";

my $result = executeQuery($metacat, $queryTerm);

# Extract an array of all of the entity URLs for each EML document
my $listRef = extractEntityUrlList($result);

# Retrieve the entities, save them in the cache,  and record metadata
my $entityMetadata = cacheEntities($cache, $listRef);

# Print out the results
#printNestedArray($entityMetadata);

exit(0);

############################################################################
#
# SUBROUTINES
#
############################################################################

#
# Create a connection to the metacat server
#
sub openMetacatConnection {
    my $url = shift;

    my $metacat = Metacat->new();
    if ($metacat) {
        $metacat->set_options( metacatUrl => $url );
    } else {
        die("Could not open connection to Metacat url: $url\n");
    }
    return $metacat;
}

#
# Execute a metacat query and return the XML resultset
#
sub executeQuery {
    my $metacat = shift;
    my $queryTerm = shift;

    my $query = "<?xml version=\"1.0\" ?> <pathquery version=\"1.2\">  <querytitle>Untitled-Search-2</querytitle>  <returndoctype>-//ecoinformatics.org//eml-dataset-2.0.0beta6//EN</returndoctype> <returndoctype>-//NCEAS//eml-dataset-2.0//EN</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.0</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.1</returndoctype><returnfield>dataTable/physical/distribution/online/url</returnfield><returnfield>dataTable/entityName</returnfield><querygroup operator=\"UNION\"><queryterm searchmode=\"contains\" casesensitive=\"false\"><value>$queryTerm</value><pathexpr>surName</pathexpr></queryterm></querygroup></pathquery>";

    my $code = $metacat->squery($query);
    
    my $result =$metacat->getMessage();

    if ($result eq "") {
        print $code, "\n";
        print "Message: ", $result, "\n";
        print ("Error or timeout from metacat...");
        exit();
    }

    return $result;
}

#
# Extract the docid and entity urls for each document in the list
#
sub extractEntityUrlList {
    my $resultset = shift;

    my $parser = new XML::DOM::Parser;
    my $node;
    my $docid;
    my $doc = $parser->parse($resultset);
    my $nodes = $doc->getElementsByTagName("docid");
    my $numberNodes = $nodes->getLength;
    my @urlList;

    # Loop through each of the documents in the resultset
    for (my $i =0; $i < $numberNodes; $i++) {
        my $node = $nodes->item($i);
        $docid =  trimwhitespace($node->getFirstChild()->getNodeValue());
    
        $node = $node->getParentNode(); 
        my $tempnodes = $node->getElementsByTagName("param");
        my $tempnumberNodes = $tempnodes->getLength;
 
        my $disturl = "";
    
        # Loop through each of the "param" elements for this document
        for (my $j =0; $j < $tempnumberNodes; $j++) {
	        my $tempnode = $tempnodes->item($j);	
	        my $paramname = $tempnode->getAttributeNode("name")->getValue();
	        if ($paramname eq "dataTable/physical/distribution/online/url") {
	            $disturl = trimwhitespace(
                        $tempnode->getFirstChild()->getNodeValue());
                push(@urlList, [$docid, $disturl]);
	        }
        }
    }
    return \@urlList;
}


#
# Remove whitespace from the start and end of the string
#
sub trimwhitespace($)
{
  my $string = shift;
  $string =~ s/^\s+//;
  $string =~ s/\s+$//;
  return $string;
}

#
# Print out a nested array of arrays
#
sub printNestedArray {
    my $listRef = shift;

    for (my $i = 0; $i <= $#{$listRef}; $i++) {
        my $innerArray = $$listRef[$i];
        printArray($innerArray);
    }
}

#
# Print an array of scalars of arbitrary length, separating values with commas
#
sub printArray {
    my $innerArray = shift;
    my $innerLength = $#{$innerArray};
    for (my $i=0; $i <= $innerLength; $i++) {
        print $$innerArray[$i];
        my $delim = ($i eq $innerLength) ? "\n" : ",";
        print $delim;
    }
}

#
# For each entity in the list, try to cache the entity after downloading it
# and return information about the size of each entity
#
sub cacheEntities {
    my $cache = shift;
    my $listRef = shift;

    my @entityMetadata;

    # Create a user agent object for downloading from URLs
    my $ua = LWP::UserAgent->new;
    $ua->agent("Metacat Harvester 1.0 ");
    $ua->timeout(600);

    # Loop through all of the entity URLs
    for (my $i = 0; $i <= $#{$listRef}; $i++) {
        my $entity;
        my $entitySize;
        my $packageId = $$listRef[$i][0];
        my $entityUrl = $$listRef[$i][1];
        if ($entityUrl =~ /^ecogrid:/) {
            #print "Need to process Ecogrid uri: ", $entityUrl, "\n";
            my $dataDir = '/var/metacat/data/';
            my $pos = length("ecogrid://knb/");
            my $entityId = substr($entityUrl, $pos);
            #print "Looking for Ecogrid file: ", $dataDir . $entityId, "\n";
            my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,
                $mtime,$ctime,$blksize,$blocks) = stat($dataDir . $entityId);
            #print "Got Ecogrid size: ", $size, "\n";
            $entity = 1;
            $entitySize = $size;
        } else {
            # For regular URLs, check if its in the cache already, and use
            # it if it is.  If not, download it and save it to the cache, but
            # only if its not an HTML file (test for <html> is simplistic)
            my $entity = $cache->get( $entityUrl );
            if ( defined $entity ) {
                if ($entity =~ /<html>/) {
                    $entity = -2;
                    $entitySize = -2;
                    $cache->remove( $entityUrl );
                } else {
                    $entitySize = length($entity);
                }
            } else {
                $entity = downloadEntity($ua, $entityUrl);
                if ($entity == -1) {
                    $entitySize = -1;
                    #print("Error on download for $entityUrl\n");
                } elsif ($entity =~ /<html>/) {
                    $entity = -2;
                    $entitySize = -2;
                } else {
                    # write the data to cache, using URL as key
                    $cache->set( $entityUrl, $entity, "never" );
                    $entitySize = length($entity);
                }
            }
        }

        # Record metadata about this entity
        my $info = [$packageId, $entityUrl, 
                ($entity < 0) ? $entity : $entitySize];
        printArray($info);
        push(@entityMetadata, $info);
    }
    return \@entityMetadata;
}

#
# Download a single entity from a given URL and return it, or return -1 on error
#
sub downloadEntity {
    my $ua = shift;
    my $url = shift;

    # Create a request
    my $req = HTTP::Request->new(GET => $url);

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);
    
    # Check the outcome of the response
    if ($res->is_success) {
        return $res->content;
    } else {
        #print $res->status_line, "\n";
        return -1;
    }
}

# 
# Create a new cache to be used for storing downloaded entities
#
sub initializeCache {
    my $cacheDir = shift;

    my $cache = new Cache::FileCache( );
    $cache->set_cache_root($cacheDir);

    return $cache;
}