addresses #103, figure out how to store captured pages in native export format

import/export of file data should work for all file types _except_ snapshots (in this situation, export is working, but import is not yet complete; see #193)
also, fixes a potential security issue that could have allowed malicious web translators to post local data to remote sites (although, given we maintain the central repository and there's no easy way to install a translator, the risk would have been minimal to begin with).
This commit is contained in:
Simon Kornblith 2006-08-18 05:58:14 +00:00
parent 10ba568ee8
commit 20486d5053
4 changed files with 343 additions and 24 deletions

View file

@ -52,7 +52,7 @@ var Scholar_File_Interface_Export = new function() {
var defValue = _options[option];
var element = document.getElementById(option);
if(typeof(defValue) == "bool") {
if(typeof(defValue) == "boolean") {
if(element.checked == "true") {
_options[option] = true;
} else {

View file

@ -1,6 +1,4 @@
// Scholar for Firefox Translate
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL
// Scholar for Firefox Translate Engine
/*
* Scholar.Translate: a class for translation of Scholar metadata from and to
@ -66,6 +64,7 @@
* returned items
* _storageStream - the storage stream to be used, if one is configured
* _storageStreamLength - the length of the storage stream
* _exportFileDirectory - the directory to which files will be exported
*
* WEB-ONLY PRIVATE PROPERTIES:
*
@ -648,7 +647,7 @@ Scholar.Translate.prototype._parseDetectCode = function(translator) {
*
* dataMode
* valid: import, export
* options: rdf, text
* options: rdf, block, line
* purpose: selects whether write/read behave as standard text functions or
* using Mozilla's built-in support for RDF data sources
*
@ -669,6 +668,7 @@ Scholar.Translate.prototype._configure = function(option, value) {
*
* called as addOption() in detect code
*
* current options are exportNotes and exportFileData
*/
Scholar.Translate.prototype._addOption = function(option, value) {
this._displayOptions[option] = value;
@ -796,6 +796,45 @@ Scholar.Translate.prototype._closeStreams = function() {
this._streams = new Array();
}
/*
 * Creates an attachment in the library from an attachment record supplied by
 * an import translator.
 *
 * attachment - record to import; the fields read here are path, url, mimeType
 *              and title
 * sourceID   - handed through to Scholar.Attachments as the source item
 *
 * Returns the ID of the attachment that was created, or undefined when nothing
 * was created (neither path nor url given, or both given, which is treated as
 * a snapshot).
 */
Scholar.Translate.prototype._itemImportAttachment = function(attachment, sourceID) {
	Scholar.debug(attachment);

	if(attachment.path) {
		if(attachment.url) {
			// a path together with a url is a snapshot
			Scholar.debug("not adding attachment: snapshot import not yet implemented");
			return undefined;
		}

		// the path is a URI string; resolve it to an nsIFile
		var ioService = Components.classes["@mozilla.org/network/io-service;1"].
			getService(Components.interfaces.nsIIOService);
		var fileURL = ioService.newURI(attachment.path, "", null).
			QueryInterface(Components.interfaces.nsIFileURL);

		// import the file, then fetch the resulting attachment item
		var importedID = Scholar.Attachments.importFromFile(fileURL.file, sourceID);
		var importedItem = Scholar.Items.get(importedID);
		if(attachment.title) {
			importedItem.setField("title", attachment.title);
		}
		return importedID;
	}

	if(!attachment.url) {
		Scholar.debug("not adding attachment: no path or url specified");
		return undefined;
	}

	// no local data: link to the url
	return Scholar.Attachments.linkFromURL(attachment.url, sourceID,
		attachment.mimeType || undefined,
		attachment.title || undefined);
}
/*
* executed when an item is done and ready to be loaded into the database
*/
@ -833,6 +872,8 @@ Scholar.Translate.prototype._itemDone = function(item) {
var myID = Scholar.Notes.add(item.note);
// re-retrieve the item
var newItem = Scholar.Items.get(myID);
} else if(type == "attachment") {
var myID = this._itemImportAttachment(item, null);
} else {
// create new item
var typeID = Scholar.ItemTypes.getID(type);
@ -911,9 +952,11 @@ Scholar.Translate.prototype._itemDone = function(item) {
// handle attachments
if(item.attachments) {
for each(var attachment in item.attachments) {
if(!attachment.url && (this.type != "web" || !attachment.document)) {
Scholar.debug("not adding attachment: no URL specified");
} else if(this.type == "web") {
if(this.type == "web") {
if(!attachment.url && !attachment.document) {
Scholar.debug("not adding attachment: no URL specified");
}
if(attachment.downloadable && this._downloadAssociatedFiles) {
if(attachment.document) {
var attachmentID = Scholar.Attachments.importFromDocument(attachment.document, myID);
@ -925,9 +968,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
attachmentItem.setField("title", attachment.title);
}
} else {
Scholar.Attachments.importFromURL(attachment.url, myID,
(attachment.mimeType ? attachment.mimeType : undefined),
(attachment.title ? attachment.title : undefined));
Scholar.Attachments.importFromURL(attachment.url, myID);
}
} else {
if(attachment.document) {
@ -945,7 +986,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
}
}
} else if(this.type == "import") {
// TODO
this._itemImportAttachment(attachment, myID);
}
}
}
@ -1173,7 +1214,6 @@ Scholar.Translate.prototype._importConfigureIO = function() {
* does the actual export, after code has been loaded and parsed
*/
Scholar.Translate.prototype._export = function() {
this._exportConfigureIO();
// get items
if(this.items) {
@ -1181,6 +1221,7 @@ Scholar.Translate.prototype._export = function() {
} else {
this._itemsLeft = Scholar.getItems();
}
// run handler for items available
this._runHandler("itemCount", this._itemsLeft.length);
@ -1189,6 +1230,45 @@ Scholar.Translate.prototype._export = function() {
this._collectionsLeft = Scholar.getCollections();
}
Scholar.debug(this._displayOptions);
// export file data, if requested
if(this._displayOptions["exportFileData"]) {
// generate directory
var directory = Components.classes["@mozilla.org/file/local;1"].
createInstance(Components.interfaces.nsILocalFile);
directory.initWithFile(this.location.parent);
// get name
var name = this.location.leafName;
var extensionMatch = /^(.*)\.[a-zA-Z0-9]+$/
var m = extensionMatch.exec(name);
if(m) {
name = m[0];
}
directory.append(name);
// create directory
directory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
// generate a new location
var originalName = this.location.leafName;
this.location = Components.classes["@mozilla.org/file/local;1"].
createInstance(Components.interfaces.nsILocalFile);
this.location.initWithFile(directory);
this.location.append(originalName);
// create files directory
this._exportFileDirectory = Components.classes["@mozilla.org/file/local;1"].
createInstance(Components.interfaces.nsILocalFile);
this._exportFileDirectory.initWithFile(directory);
this._exportFileDirectory.append("files");
this._exportFileDirectory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
}
// configure IO
this._exportConfigureIO();
try {
this._sandbox.doExport();
} catch(e) {
@ -1229,14 +1309,98 @@ Scholar.Translate.prototype._exportConfigureIO = function() {
}
}
/*
* copies attachment and returns data, given an attachment object
*/
Scholar.Translate.prototype._exportGetAttachment = function(attachment) {
var attachmentArray = new Object();
var attachmentID = attachment.getID();
var linkMode = attachment.getAttachmentLinkMode();
// get url if one exists
if(linkMode == Scholar.Attachments.LINK_MODE_LINKED_URL ||
linkMode == Scholar.Attachments.LINK_MODE_IMPORTED_URL) {
var url = attachment.getURL()
attachmentArray.url = url;
} else if(!this._displayOptions["exportFileData"]) {
// only export urls, not files, if exportFileData is off
return false;
}
// add item ID
attachmentArray.itemID = attachmentID;
// get title
attachmentArray.title = attachment.getField("title");
// get mime type
attachmentArray.mimeType = attachment.getAttachmentMimeType();
if(linkMode != Scholar.Attachments.LINK_MODE_LINKED_URL &&
this._displayOptions["exportFileData"]) {
// add path and filename if not an internet link
attachmentArray.path = "files/"+attachmentID+"/";
var file = attachment.getFile();
attachmentArray.filename = file.leafName;
if(linkMode == Scholar.Attachments.LINK_MODE_LINKED_FILE) {
// create a new directory
var directory = Components.classes["@mozilla.org/file/local;1"].
createInstance(Components.interfaces.nsILocalFile);
directory.initWithFile(this._exportFileDirectory);
directory.append(attachmentID);
directory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
// copy file
file.copyTo(directory, attachmentArray.filename);
} else {
// copy imported files from the Scholar directory
var directory = Scholar.getStorageDirectory();
directory.append(attachmentID);
directory.copyTo(this._exportFileDirectory, attachmentID);
}
}
Scholar.debug(attachmentArray);
return attachmentArray;
}
/*
* gets the next item to process (called as Scholar.nextItem() from code)
*/
Scholar.Translate.prototype._exportGetItem = function() {
if(this._itemsLeft.length != 0) {
var returnItem = this._itemsLeft.shift();
// skip files if exportFileData is off, or if the file isn't standalone
if(returnItem.isAttachment() &&
(!this._displayOptions["exportFileData"] ||
returnItem.getSource())) {
return this._exportGetItem();
}
// export file data for single files
if(returnItem.isAttachment()) { // an independent attachment
var returnItemArray = this._exportGetAttachment(returnItem);
returnItemArray.itemType = "attachment";
return returnItemArray;
} else {
var returnItemArray = returnItem.toArray();
// get attachments, although only urls will be passed if exportFileData
// is off
returnItemArray.attachments = new Array();
var attachments = returnItem.getAttachments();
for each(attachmentID in attachments) {
var attachment = Scholar.Items.get(attachmentID);
var attachmentInfo = this._exportGetAttachment(attachment);
if(attachmentInfo) {
returnItemArray.attachments.push(attachmentInfo);
}
}
}
this._runHandler("itemDone", returnItem);
return returnItem.toArray();
return returnItemArray;
}
return false;

View file

@ -254,10 +254,19 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
this.processDocuments([ url ], succeeded, null, failed);
}
// Matches the URLs the ingester utilities accept: an explicit http, https or
// ftp scheme (case-insensitive), or a string in which a slash occurs with no
// colon before it. Built directly in the constructor because
// RegExp.prototype.compile is deprecated.
Scholar.Utilities.Ingester._protocolRe = new RegExp("^(?:(?:http|https|ftp):|[^:]*/)", "i");
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
if(this.translate.locationIsProxied) {
// rewrite (when proxied) and validate every requested URL
for(var i in urls) {
	// rewrite to the proxied form once, and only when the location is proxied
	if(this.translate.locationIsProxied) {
		urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
	}
	// reject any URL that _protocolRe does not match
	if(!Scholar.Utilities.Ingester._protocolRe.test(urls[i])) {
		throw("invalid URL in processDocuments");
	}
}
}
@ -282,6 +291,9 @@ Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
}
if(!Scholar.Utilities.Ingester._protocolRe.test(url)) {
throw("invalid URL in processDocuments");
}
var translate = this.translate;
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
@ -298,6 +310,9 @@ Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
}
if(!Scholar.Utilities.Ingester._protocolRe.test(url)) {
throw("invalid URL in processDocuments");
}
var translate = this.translate;
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {

View file

@ -1,4 +1,4 @@
-- 50
-- 51
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
@ -3548,12 +3548,39 @@ function generateCollection(collection) {
Scholar.RDF.addStatement(collectionResource, n.dcterms+"hasPart", "#collection:"+child.id, false);
// do recursive processing of collections
generateCollection(child);
} else {
} else if(itemResources[child.id]) {
Scholar.RDF.addStatement(collectionResource, n.dcterms+"hasPart", itemResources[child.id], false);
}
}
}
/*
 * Writes the RDF description of one exported attachment.
 *
 * attachmentResource - resource that stands for the attachment
 * attachment         - record whose url, mimeType and title are written when
 *                      present
 *
 * Reads the globals rdf and n, which must be set before this is called
 * (doExport assigns rdf).
 */
function handleAttachment(attachmentResource, attachment) {
	Scholar.RDF.addStatement(attachmentResource, rdf+"type", n.fs+"File", false);

	if(attachment.url) {
		// the url becomes a dc:identifier whose node is typed dcterms:URI
		var urlTerm = Scholar.RDF.newResource();
		Scholar.RDF.addStatement(urlTerm, rdf+"type", n.dcterms+"URI", false);
		Scholar.RDF.addStatement(urlTerm, rdf+"value", attachment.url, true);
		Scholar.RDF.addStatement(attachmentResource, n.dc+"identifier", urlTerm, false);
	}

	if(attachment.mimeType) {
		// the mime type becomes a dc:format whose node is typed dcterms:IMT;
		// skipped when absent so no empty or undefined literal is written
		var formatTerm = Scholar.RDF.newResource();
		Scholar.RDF.addStatement(formatTerm, rdf+"type", n.dcterms+"IMT", false);
		Scholar.RDF.addStatement(formatTerm, rdf+"value", attachment.mimeType, true);
		Scholar.RDF.addStatement(attachmentResource, n.dc+"format", formatTerm, false);
	}

	if(attachment.title) {
		// add title
		Scholar.RDF.addStatement(attachmentResource, n.dc+"title", attachment.title, true);
	}
}
function doExport() {
rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
@ -3563,7 +3590,8 @@ function doExport() {
dcterms:"http://purl.org/dc/terms/",
prism:"http://prismstandard.org/namespaces/1.2/basic/",
foaf:"http://xmlns.com/foaf/0.1/",
vcard:"http://nwalsh.com/rdf/vCard"
vcard:"http://nwalsh.com/rdf/vCard#",
fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
};
// add namespaces
@ -3584,7 +3612,10 @@ function doExport() {
while(item = Scholar.nextItem()) {
items.push(item);
if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
if(item.itemType == "attachment" && item.path) {
// file is stored locally (paths are always unique)
itemResources[item.itemID] = item.path+item.filename;
} else if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
itemResources[item.itemID] = "urn:isbn:"+item.ISBN;
usedResources[itemResources[item.itemID]] = true;
} else if(item.url && !usedResources[item.url]) {
@ -3598,6 +3629,20 @@ function doExport() {
for(var j in item.notes) {
itemResources[item.notes[j].itemID] = "#item:"+item.notes[j].itemID;
}
for each(var attachment in item.attachments) {
if(attachment.path) {
// file is stored locally (paths are always unique)
itemResources[attachment.itemID] = attachment.path+attachment.filename;
} else if(!usedResources[attachment.url]) {
// file is referenced via url, and no other item has this url
itemResources[attachment.itemID] = attachment.url;
usedResources[attachment.url] = true;
} else {
// just specify a node ID
itemResources[attachment.itemID] = "#item:"+attachment.itemID;
}
}
}
for each(item in items) {
@ -3650,6 +3695,9 @@ function doExport() {
if(!Scholar.getOption("exportNotes")) {
continue;
}
} else if(item.itemType == "attachment") {
handleAttachment(resource, item);
continue;
}
if(type) {
Scholar.RDF.addStatement(resource, rdf+"type", n.bib+type, false);
@ -3692,6 +3740,18 @@ function doExport() {
Scholar.RDF.addStatement(resource, n.dc+"source", item.source, true);
}
// url
if(item.url) {
	// add url as identifier
	var term = Scholar.RDF.newResource();
	// set term type
	Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"URI", false);
	// set url value: the item's own url, matching the guard above (not the
	// url of whatever attachment an earlier loop left in scope)
	Scholar.RDF.addStatement(term, rdf+"value", item.url, true);
	// add relationship to resource
	Scholar.RDF.addStatement(resource, n.dc+"identifier", term, false);
}
// accessionNumber as generic ID
if(item.accessionNumber) {
Scholar.RDF.addStatement(resource, n.dc+"identifier", item.accessionNumber, true);
@ -3745,7 +3805,7 @@ function doExport() {
}
// publication gets linked to container via isPartOf
if(item.publication) {
if(item.publicationTitle) {
Scholar.RDF.addStatement((containerElement ? containerElement : resource), n.dc+"title", item.publicationTitle, true);
}
@ -3860,6 +3920,14 @@ function doExport() {
}
}
/** FILES **/
for each(var attachment in item.attachments) {
var attachmentResource = itemResources[attachment.itemID];
Scholar.RDF.addStatement(resource, n.dc+"relation", attachmentResource, false);
handleAttachment(attachmentResource, attachment);
}
/** TAGS **/
for(var j in item.tags) {
@ -4048,6 +4116,54 @@ function handleCreators(newItem, creators, creatorType) {
}
}
/*
 * Gets attachment info from a file node.
 *
 * node       - RDF node (or URI string) describing the file
 * attachment - optional record to fill in; a new object is created when omitted
 *
 * Sets title, and, when found, url (from a dc:identifier typed dcterms:URI),
 * mimeType (from a dc:format typed dcterms:IMT) and path (when the node's own
 * URI starts with "file:///"). Returns the record.
 *
 * Reads the globals rdf and n, which must be set before this is called.
 */
function handleAttachment(node, attachment) {
	if(!attachment) {
		// a plain object: the record only ever carries named fields
		attachment = {};
	}

	attachment.title = getFirstResults(node, [n.dc+"title"], true);

	// identifiers: a non-string identifier typed dcterms:URI holds the url
	var identifiers = getFirstResults(node, [n.dc+"identifier"]);
	for(var i in identifiers) {
		var identifier = identifiers[i];
		if(typeof(identifier) == "string") {
			continue;
		}
		var identifierType = Scholar.RDF.getTargets(identifier, rdf+"type");
		if(identifierType) {
			identifierType = Scholar.RDF.getResourceURI(identifierType[0]);
			if(identifierType == n.dcterms+"URI") {	// uri is url
				attachment.url = getFirstResults(identifier, [rdf+"value"], true);
			}
		}
	}

	// formats: a non-string format typed dcterms:IMT holds the mime type
	var formats = getFirstResults(node, [n.dc+"format"]);
	for(var j in formats) {
		var format = formats[j];
		if(typeof(format) == "string") {
			continue;
		}
		var formatType = Scholar.RDF.getTargets(format, rdf+"type");
		if(formatType) {
			formatType = Scholar.RDF.getResourceURI(formatType[0]);
			if(formatType == n.dcterms+"IMT") {	// value is the mime type
				attachment.mimeType = getFirstResults(format, [rdf+"value"], true);
			}
		}
	}

	var stringNode = node;
	if(typeof(stringNode) != "string") {
		stringNode = Scholar.RDF.getResourceURI(stringNode);
	}
	// only inspect the prefix when a URI string is actually available
	if(typeof(stringNode) == "string" && stringNode.substr(0, 8) == "file:///") {
		// the node's own URI is a file URL, so it names a local path
		attachment.path = stringNode;
	}

	return attachment;
}
// processes collections recursively
function processCollection(node, collection) {
if(!collection) {
@ -4104,7 +4220,8 @@ function doImport() {
dcterms:"http://purl.org/dc/terms/",
prism:"http://prismstandard.org/namespaces/1.2/basic/",
foaf:"http://xmlns.com/foaf/0.1/",
vcard:"http://nwalsh.com/rdf/vCard"
vcard:"http://nwalsh.com/rdf/vCard#",
fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
};
callNumberTypes = [
@ -4165,8 +4282,6 @@ function doImport() {
} else if(type == n.bib+"Memo") {
// check to see if this note is independent
var arcs = Scholar.RDF.getArcsIn(node);
Scholar.Utilities.debug("working on a note");
Scholar.Utilities.debug(arcs);
var skip = false;
for each(var arc in arcs) {
arc = Scholar.RDF.getResourceURI(arc);
@ -4184,6 +4299,19 @@ function doImport() {
// skip collections until all the items are done
collections.push(node);
continue;
} else if(type == n.fs+"File") {
// check to see if file is independent
var arcs = Scholar.RDF.getArcsIn(node);
if(arcs.length) {
continue;
}
// process as file
newItem.itemType = "attachment";
handleAttachment(node, newItem);
Scholar.Utilities.debug(newItem);
newItem.complete();
continue;
} else { // default to book
newItem.itemType = "book";
}
@ -4361,13 +4489,25 @@ function doImport() {
}
}
}
/* ATTACHMENTS */
var relations = getFirstResults(node, [n.dc+"relation"]);
for each(var relation in relations) {
var type = Scholar.RDF.getTargets(relation, rdf+"type");
if(type) {
type = Scholar.RDF.getResourceURI(type[0]);
if(type == n.fs+"File") {
newItem.attachments.push(handleAttachment(relation));
}
}
}
newItem.complete();
}
/* COLLECTIONS */
for each(collection in collections) {
for each(var collection in collections) {
if(!Scholar.RDF.getArcsIn(collection)) {
var newCollection = new Scholar.Collection();
processCollection(collection, newCollection);