/**
* '$RCSfile$'
* Copyright: 2007 Regents of the University of California and the
* National Center for Ecological Analysis and Synthesis
*
* '$Author$'
* '$Date$'
* '$Revision$'
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package edu.ucsb.nceas.metacat;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimerTask;
import java.net.URLEncoder;
import org.apache.log4j.Logger;
import org.apache.commons.lang.StringEscapeUtils;
import edu.ucsb.nceas.metacat.database.DBConnection;
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
import edu.ucsb.nceas.metacat.properties.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
/**
* A Sitemap represents a document that lists all of the content of the Metacat
* server for use by harvesting spiders that wish to index the contents of the
* Metacat site. It is used to generate an XML representation of all of the URLs
* of the site in order to facilitate indexing of the metacat site by search
* engines.
*
* Which objects are included?
*
* - Only documents with public read permission are included
* - Only documents with object_formats in the xml_catalog table are included
* - All non-obsoleted metadata objects are included in the sitemap(s)
*
* Other notes:
*
* - The sitemaps this class generates are intended to be served by another
* application such as MetacatUI
* - A sitemap index is generated regardless of the number of URLs present
* - URLs for the location of the sitemaps and the entries themselves are
* controlled by the 'sitemap.location.base' and 'sitemap.entry.base'
* properties which can be full URLs or absolute paths.
*
* - sitemap.location.base controls first part of the URLs in the sitemap
* index
* - sitemap.entry.base controls the first part of the URLs in the sitemap
* files themselves
*
* @author Matt Jones
* @author Bryce Mecum
*/
public class Sitemap extends TimerTask {

    private static final Logger logMetacat = Logger.getLogger(Sitemap.class);

    /**
     * Construct a new instance of the Sitemap.
     *
     * @param directory
     *            the location to store sitemap files
     * @param locationBase
     *            the base URL for constructing sitemap location URLs
     * @param entryBase
     *            the base URL for constructing sitemap entry URLs
     */
    public Sitemap(File directory, String locationBase, String entryBase) {
        super();
        this.directory = directory;
        this.locationBase = locationBase;
        this.entryBase = entryBase;
    }

    /**
     * Execute the timed task when called, in this case by generating the
     * sitemap files needed for this Metacat instance.
     */
    @Override
    public void run() {
        generateSitemaps();
    }

    /**
     * Generate all of the sitemap files needed to list the URLs from this
     * instance of Metacat, using the open sitemap format described here:
     * http://www.sitemaps.org/protocol.html
     * URLs are written to a single file, unless the maximum number of URLs
     * allowed in the sitemap file is exceeded, in which case subsequent
     * numbered files are created. An index of the sitemaps is also created.
     *
     * The sitemap index can be registered with search index providers such as
     * Google, but beware that it needs to be accessible in a location above the
     * mount point for the service URLs. By default the files are placed in
     * {context}/sitemaps, but you will need to expose them at a location
     * matching what's set in the sitemap.location.base and sitemap.entry.base
     * properties in order to be trusted by Google. See the Sitemaps.org
     * documentation for details.
     *
     * Nothing is written (and a warning is logged) when the configured
     * directory is null or is not a directory. SQL and I/O failures are logged
     * as warnings and are not propagated to the caller.
     */
    public void generateSitemaps() {
        logMetacat.info("Running the Sitemap task. Directory is " + directory + " and locationBase is " + locationBase +".");

        // Guard against a missing or invalid target directory up front
        if (directory == null || !directory.isDirectory()) {
            logMetacat.warn("Sitemap not created because directory not valid.");
            return;
        }

        /* Query for documents that are:
         *  - Metadata (their object_format is in the xml_catalog)
         *  - Latest/head versions (their obsoleted_by field is NULL)
         *  - Not archived
         *  - Covered by an 'allow' access rule for the 'public' principal
         *
         * NOTE(review): the query does not inspect which permission the
         * 'allow' rule grants, and a pid could presumably appear more than
         * once if several matching xml_access rows exist -- confirm against
         * the xml_access schema.
         */
        // A subquery filters documents based upon whether they use a format
        // ID that is present in the xml_catalog table
        String metadataFormats =
            "SELECT public_id from xml_catalog " +
            "WHERE public_id is not NULL";
        String query =
            "SELECT identifier.guid as pid " +
            "FROM identifier " +
            "LEFT JOIN systemmetadata on identifier.guid = systemmetadata.guid " +
            "LEFT JOIN xml_access on identifier.guid = xml_access.guid " +
            "WHERE " +
            "systemmetadata.object_format in (" + metadataFormats + ") AND " +
            "systemmetadata.obsoleted_by is NULL AND " +
            "systemmetadata.archived = FALSE AND " +
            "xml_access.principal_name = 'public' AND " +
            "xml_access.perm_type = 'allow' " +
            "ORDER BY systemmetadata.date_uploaded ASC;";

        DBConnection dbConn = null;
        int serialNumber = -1;
        PreparedStatement stmt = null;
        ResultSet rs = null;
        OutputStreamWriter sitemap = null;

        try {
            // Get a database connection from the pool
            dbConn = DBConnectionPool
                    .getDBConnection("Sitemap.generateSitemap()");
            serialNumber = dbConn.getCheckOutSerialNumber();

            // Execute the query statement
            stmt = dbConn.prepareStatement(query);
            stmt.execute();
            rs = stmt.getResultSet();

            // Loop through all of the documents, and write them to a sitemap
            int counter = 0;
            int fileNumber = 0;
            while (rs.next()) {
                // Start a new sitemap file every MAX_URLS_IN_FILE entries
                if (counter % MAX_URLS_IN_FILE == 0) {
                    // Finish the previous file, if any (no-op when null)
                    writeSitemapFooter(sitemap);
                    sitemap = null;

                    // Open a new sitemap file for writing
                    fileNumber++;
                    File sitemapFile = new File(directory, fileRoot + fileNumber
                            + ".xml");
                    sitemap = new OutputStreamWriter(new FileOutputStream(sitemapFile), Charset.forName("UTF-8"));

                    // Write the sitemap document header for the new file
                    writeSitemapHeader(sitemap);
                }

                writeSitemapEntry(sitemap, rs.getString(1));
                counter++;
            }

            // Finish the last sitemap file, then index everything written
            writeSitemapFooter(sitemap);
            sitemap = null;
            writeSitemapIndex(fileNumber);
        } catch (SQLException e) {
            logMetacat.warn("Error while writing to the sitemap file: "
                    + e.getMessage(), e);
        } catch (IOException ioe) {
            logMetacat.warn("Could not open or write to the sitemap file."
                    + ioe.getMessage(), ioe);
        } finally {
            // Release anything still open after a failure part-way through
            closeQuietly(sitemap);
            closeQuietly(rs, stmt);
            // Return database connection to the pool
            DBConnectionPool.returnDBConnection(dbConn, serialNumber);
        }
    }

    /**
     * Write the header information in a single sitemap file. This includes the
     * XML prolog, the root element and namespace declaration, and the elements
     * leading up to the first URL entry.
     *
     * @param sitemap
     *            the Writer to use for writing the header
     * @throws IOException
     *             if there is a problem writing to the Writer
     */
    private void writeSitemapHeader(Writer sitemap) throws IOException {
        sitemap.write(PROLOG);
        sitemap.write("<urlset xmlns=\"" + SITEMAP_NAMESPACE + "\">\n");
        sitemap.flush();
    }

    /**
     * Write a URL entry to a single sitemap file. This includes the XML markup
     * surrounding a particular site URL. Nothing is written when the writer,
     * the pid, or the entry base is null.
     *
     * @param sitemap
     *            the Writer to use for writing the URL
     * @param pid
     *            the identifier to be written in the URL
     * @throws IOException
     *             if there is a problem writing to the Writer
     */
    private void writeSitemapEntry(Writer sitemap, String pid)
            throws IOException {
        if (sitemap == null || pid == null || entryBase == null) {
            return;
        }

        StringBuilder url = new StringBuilder(entryBase);
        if (!entryBase.endsWith("/")) {
            url.append("/");
        }
        // URL-encode _and_ XML escape the PID.
        url.append(StringEscapeUtils.escapeXml(
                URLEncoder.encode(pid, "UTF-8")));

        sitemap.write("  <url><loc>");
        sitemap.write(url.toString());
        sitemap.write("</loc></url>\n");
        sitemap.flush();
    }

    /**
     * Write the footer information in a single sitemap file and close the file.
     * This includes the closing tag for the root element. The writer is closed
     * even when writing the footer fails. Does nothing when the writer is null.
     *
     * @param sitemap
     *            the Writer to use for writing the footer
     * @throws IOException
     *             if there is a problem writing to or closing the Writer
     */
    private void writeSitemapFooter(Writer sitemap) throws IOException {
        if (sitemap == null) {
            return;
        }
        try {
            sitemap.write("</urlset>\n");
        } finally {
            sitemap.close();
        }
    }

    /**
     * Create an index file listing all of the sitemap files that were created.
     * I/O failures are logged as warnings and are not propagated.
     *
     * @param fileNumber the number of sitemap files that were created.
     */
    private void writeSitemapIndex(int fileNumber) {
        File sitemapIndexFile = new File(directory, indexFilename);
        OutputStreamWriter sitemapIndex = null;

        try {
            // Open a new sitemapIndex file for writing
            sitemapIndex = new OutputStreamWriter(new FileOutputStream(sitemapIndexFile), Charset.forName("UTF-8"));

            // Write the sitemap index header for the new file
            sitemapIndex.write(PROLOG);
            sitemapIndex.write("<sitemapindex xmlns=\"" + SITEMAP_NAMESPACE + "\">\n");
            sitemapIndex.flush();

            // Write out one index entry for each sitemap file; files are
            // numbered 1..fileNumber
            for (int fn = 1; fn <= fileNumber; fn++) {
                String filename = fileRoot + fn + ".xml";
                writeSitemapIndexEntry(sitemapIndex, filename);
            }

            // Write the sitemap index footer content and close the file.
            // Closing here lets a failed final flush be reported below.
            sitemapIndex.write("</sitemapindex>\n");
            sitemapIndex.close();
        } catch (IOException e) {
            logMetacat.warn("Could not open or write to the sitemap index file." + e.getMessage(), e);
        } finally {
            // Make sure the file handle is released on the error path too
            closeQuietly(sitemapIndex);
        }
    }

    /**
     * Write a single entry of the sitemap index file containing the URL to a
     * specific sitemap file and today's date as its last-modified date.
     * Nothing is written when the writer, the filename, or the location base
     * is null.
     *
     * @param sitemapIndex the writer to which the index information is written
     * @param filename the name of the sitemap file the entry points to
     * @throws IOException on error writing to the index file
     */
    private void writeSitemapIndexEntry(Writer sitemapIndex, String filename)
            throws IOException {
        if (sitemapIndex == null || filename == null || locationBase == null) {
            return;
        }

        StringBuilder url = new StringBuilder(locationBase);
        if (!locationBase.endsWith("/")) {
            url.append("/");
        }
        url.append(filename);

        // A fresh formatter per call: SimpleDateFormat must not be shared
        // between threads
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
        String lastModified = fmt.format(new Date());

        sitemapIndex.write("  <sitemap>\n");
        sitemapIndex.write("    <loc>" + url + "</loc>\n");
        sitemapIndex.write("    <lastmod>" + lastModified + "</lastmod>\n");
        sitemapIndex.write("  </sitemap>\n");
        sitemapIndex.flush();
    }

    /**
     * Close a writer, logging rather than propagating any failure.
     *
     * @param writer the writer to close; may be null
     */
    private static void closeQuietly(Writer writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            logMetacat.debug("Ignoring error while closing a sitemap file: "
                    + e.getMessage(), e);
        }
    }

    /**
     * Close a result set and its statement, logging rather than propagating
     * any failure.
     *
     * @param rs the result set to close; may be null
     * @param stmt the statement to close; may be null
     */
    private static void closeQuietly(ResultSet rs, PreparedStatement stmt) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                logMetacat.debug("Ignoring error while closing the sitemap result set: "
                        + e.getMessage(), e);
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                logMetacat.debug("Ignoring error while closing the sitemap statement: "
                        + e.getMessage(), e);
            }
        }
    }

    // Member variables

    /** The directory in which sitemaps are written. */
    private final File directory;

    /** The root url for constructing sitemap location URLs. */
    private final String locationBase;

    /** The root url for constructing sitemap entry URLs. */
    private final String entryBase;

    /** Maximum number of URLs to write to a single sitemap file */
    static final int MAX_URLS_IN_FILE = 50000; // 50,000 according to Google

    /** The root name to be used in naming sitemap files. */
    static final String fileRoot = "sitemap";

    /** The name to give to the sitemap index file */
    static final String indexFilename = "sitemap_index.xml";

    /** A String constant containing the XML prolog to be written in files. */
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";

    /** The XML namespace of the sitemap and sitemap index root elements. */
    private static final String SITEMAP_NAMESPACE =
            "http://www.sitemaps.org/schemas/sitemap/0.9";
}