closes #103, figure out how to store captured pages in native export format

fixes ampersands in citation COinS
fixes tags and seeAlso in import/export (should now work for all items)
This commit is contained in:
Simon Kornblith 2006-08-20 04:35:04 +00:00
parent 217636ad14
commit 04d05548b2
4 changed files with 180 additions and 133 deletions

View file

@ -53,14 +53,13 @@ var Scholar_File_Interface_Export = new function() {
var element = document.getElementById(option);
if(typeof(defValue) == "boolean") {
if(element.checked == "true") {
if(element.checked == true) {
_options[option] = true;
} else {
_options[option] = false;
}
}
}
Scholar.debug(_options);
}
/*

View file

@ -202,7 +202,7 @@ CSL.prototype.createBibliography = function(items, format) {
// add line feeds
if(format == "HTML") {
var coins = Scholar.OpenURL.createContextObject(item, "1.0");
string += '<span class="Z3988" title="'+coins+'"></span>';
string += '<span class="Z3988" title="'+coins.replace("&", "&amp;")+'"></span>';
if(this._class == "note") {
output += "<li>"+string+"</li>\r\n";

View file

@ -916,39 +916,64 @@ Scholar.Translate.prototype._closeStreams = function() {
* imports an attachment from the disk
*/
// Creates an attachment from an attachment record supplied by an import
// translator.
//
// attachment - object describing the attachment; the properties read here are
//              path, url, mimeType, title and charset
// sourceID   - handed unchanged to the Scholar.Attachments calls below
//              (presumably the ID of the parent item, or null for a
//              standalone attachment -- TODO confirm against callers)
//
// Returns false when the record has neither a path nor a url.
//
// NOTE(review): this span reads like two revisions of the function
// interleaved: the nsIFile setup appears twice, the title is set through two
// different variables, and "return attachmentID;" is immediately followed by
// "return attachmentItem;". As written only the first return is reachable.
// Confirm which revision is intended before relying on the return value.
Scholar.Translate.prototype._itemImportAttachment = function(attachment, sourceID) {
Scholar.debug(attachment);
if(!attachment.path) {
// create from URL
if(attachment.url) {
// mimeType and title are forwarded only when set; otherwise undefined is passed
var attachmentID = Scholar.Attachments.linkFromURL(attachment.url, sourceID,
(attachment.mimeType ? attachment.mimeType : undefined),
(attachment.title ? attachment.title : undefined));
var attachmentItem = Scholar.Items.get(attachmentID);
} else {
Scholar.debug("not adding attachment: no path or url specified");
return false;
}
} else {
// generate nsIFile
// attachment.path is parsed as a URI and queried for nsIFileURL, so it is
// expected to be a file URI rather than a bare file system path
var IOService = Components.classes["@mozilla.org/network/io-service;1"].
getService(Components.interfaces.nsIIOService);
var uri = IOService.newURI(attachment.path, "", null);
var file = uri.QueryInterface(Components.interfaces.nsIFileURL).file;
if(attachment.url) {
// NOTE(review): the debug message below says snapshot import is not
// implemented, yet the snapshot import call follows directly -- the message
// looks like a leftover from the earlier revision
Scholar.debug("not adding attachment: snapshot import not yet implemented");
// import from nsIFile
// both a local file and an original url are present: import as a snapshot,
// passing null for the charset when none was recorded
var attachmentID = Scholar.Attachments.importSnapshotFromFile(file,
attachment.url, attachment.title, attachment.mimeType,
(attachment.charset ? attachment.charset : null), sourceID);
var attachmentItem = Scholar.Items.get(attachmentID);
} else {
// generate nsIFile
// NOTE(review): repeats the nsIFile setup already done above in this branch
var IOService = Components.classes["@mozilla.org/network/io-service;1"].
getService(Components.interfaces.nsIIOService);
var uri = IOService.newURI(attachment.path, "", null);
var file = uri.QueryInterface(Components.interfaces.nsIFileURL).file;
// import from nsIFile
var attachmentID = Scholar.Attachments.importFromFile(file, sourceID);
// get attachment item
// NOTE(review): the same item is fetched twice under two names
var myAttachmentItem = Scholar.Items.get(attachmentID);
var attachmentItem = Scholar.Items.get(attachmentID);
if(attachment.title) {
// set title
myAttachmentItem.setField("title", attachment.title);
attachmentItem.setField("title", attachment.title);
}
}
}
// NOTE(review): the second return can never execute; nearby call sites are
// split between treating the result as an ID and calling getID() on it
return attachmentID;
return attachmentItem;
}
/*
 * handles tags and see also data for notes and attachments
 *
 * item    - object supplied by the translator; its itemID, seeAlso and tags
 *           properties are read here
 * newItem - the item created for it; getID(), addSeeAlso() and addTag() are
 *           called on it
 *
 * Records item.itemID -> newItem.getID() in this._IDMap, links newItem to
 * every see-also target that already has an entry in the map, and copies all
 * tags across. See-also references with no entry in the map are skipped.
 */
Scholar.Translate.prototype._itemTagsAndSeeAlso = function(item, newItem) {
	Scholar.debug("handling notes and see also");
	
	// add to ID map
	if(item.itemID) {
		this._IDMap[item.itemID] = newItem.getID();
	}
	
	// add see alsos (standard for...in with an index visits the same
	// properties as the non-standard "for each" statement it replaces)
	var seeAlsos = item.seeAlso;
	for(var i in seeAlsos) {
		var mappedID = this._IDMap[seeAlsos[i]];
		if(mappedID) {
			newItem.addSeeAlso(mappedID);
		}
	}
	
	// add tags
	var tags = item.tags;
	for(var j in tags) {
		newItem.addTag(tags[j]);
	}
}
/*
@ -968,9 +993,6 @@ Scholar.Translate.prototype._itemDone = function(item) {
return;
}
if(!item.title) {
throw("item has no title");
}
var notifierStatus = Scholar.Notifier.isEnabled();
if(notifierStatus) {
@ -985,8 +1007,18 @@ Scholar.Translate.prototype._itemDone = function(item) {
// re-retrieve the item
var newItem = Scholar.Items.get(myID);
} else if(type == "attachment") {
var myID = this._itemImportAttachment(item, null);
if(this.type == "import") {
var newItem = this._itemImportAttachment(item, null);
var myID = newItem.getID();
} else {
Scholar.debug("discarding standalone attachment");
return false;
}
} else {
if(!item.title) {
throw("item has no title");
}
// create new item
var typeID = Scholar.ItemTypes.getID(type);
var newItem = Scholar.Items.getNewItemByType(typeID);
@ -1016,13 +1048,11 @@ Scholar.Translate.prototype._itemDone = function(item) {
}
} else if(i == "title") { // skip checks for title
newItem.setField(i, data);
} else if(i == "tags") { // add tags
for(var j in data) {
newItem.addTag(data[j]);
}
} else if(i == "seeAlso") {
newItem.translateSeeAlso = data;
} else if(i != "note" && i != "notes" && i != "itemID" && (fieldID = Scholar.ItemFields.getID(i))) {
} else if(i != "note" && i != "notes" && i != "itemID" &&
i != "attachments" && i != "tags" &&
(fieldID = Scholar.ItemFields.getID(i))) {
// if field is in db
if(Scholar.ItemFields.isValidForType(fieldID, typeID)) {
// if field is valid for this type
@ -1049,15 +1079,8 @@ Scholar.Translate.prototype._itemDone = function(item) {
var noteID = Scholar.Notes.add(note.note, myID);
// handle see also
if(note.seeAlso) {
var myNote = Scholar.Items.get(noteID);
for each(var seeAlso in note.seeAlso) {
if(this._IDMap[seeAlso]) {
myNote.addSeeAlso(this._IDMap[seeAlso]);
}
}
}
var myNote = Scholar.Items.get(noteID);
this._itemTagsAndSeeAlso(note, myNote);
}
}
@ -1071,7 +1094,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
if(attachment.downloadable && this._downloadAssociatedFiles) {
if(attachment.document) {
var attachmentID = Scholar.Attachments.importFromDocument(attachment.document, myID);
attachmentID = Scholar.Attachments.importFromDocument(attachment.document, myID);
// change title, if a different one was specified
if(attachment.title && (!attachment.document.title
@ -1086,7 +1109,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
}
} else {
if(attachment.document) {
var attachmentID = Scholar.Attachments.linkFromURL(attachment.document.location.href, myID,
attachmentID = Scholar.Attachments.linkFromURL(attachment.document.location.href, myID,
(attachment.mimeType ? attachment.mimeType : attachment.document.contentType),
(attachment.title ? attachment.title : attachment.document.title));
} else {
@ -1094,13 +1117,16 @@ Scholar.Translate.prototype._itemDone = function(item) {
Scholar.debug("notice: either mimeType or title is missing; attaching file will be slower");
}
var attachmentID = Scholar.Attachments.linkFromURL(attachment.url, myID,
attachmentID = Scholar.Attachments.linkFromURL(attachment.url, myID,
(attachment.mimeType ? attachment.mimeType : undefined),
(attachment.title ? attachment.title : undefined));
}
}
} else if(this.type == "import") {
this._itemImportAttachment(attachment, myID);
var attachmentItem = this._itemImportAttachment(attachment, myID);
if(attachmentItem) {
this._itemTagsAndSeeAlso(attachment, attachmentItem);
}
}
}
}
@ -1120,6 +1146,12 @@ Scholar.Translate.prototype._itemDone = function(item) {
}
}
if(item.tags) {
for each(var tag in item.tags) {
newItem.addTag(tag);
}
}
delete item;
// only re-enable if notifier was enabled at the beginning of scraping
@ -1358,7 +1390,7 @@ Scholar.Translate.prototype._export = function() {
var extensionMatch = /^(.*)\.[a-zA-Z0-9]+$/
var m = extensionMatch.exec(name);
if(m) {
name = m[0];
name = m[1];
}
directory.append(name);
@ -1447,13 +1479,18 @@ Scholar.Translate.prototype._exportGetAttachment = function(attachment) {
attachmentArray.title = attachment.getField("title");
// get mime type
attachmentArray.mimeType = attachment.getAttachmentMimeType();
// get charset
attachmentArray.charset = attachment.getAttachmentCharset();
// get seeAlso
attachmentArray.seeAlso = attachment.getSeeAlso();
// get tags
attachmentArray.tags = attachment.getTags();
if(linkMode != Scholar.Attachments.LINK_MODE_LINKED_URL &&
this._displayOptions["exportFileData"]) {
// add path and filename if not an internet link
attachmentArray.path = "files/"+attachmentID+"/";
var file = attachment.getFile();
attachmentArray.filename = file.leafName;
attachmentArray.path = "files/"+attachmentID+"/"+file.leafName;
if(linkMode == Scholar.Attachments.LINK_MODE_LINKED_FILE) {
// create a new directory
@ -1847,7 +1884,7 @@ Scholar.Translate.RDF.prototype.getSources = function(resource, property) {
property = this._getResource(property);
resource = this._getResource(resource);
var enumerator = this._dataSource.GetSources(resource, property, true);
var enumerator = this._dataSource.GetSources(property, resource, true);
return this._deEnumerate(enumerator);
}

View file

@ -1,4 +1,4 @@
-- 53
-- 54
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
@ -3721,10 +3721,18 @@ REPLACE INTO "translators" VALUES ('14763d24-8ba0-45df-8f52-b8d1108e7ac9', '2006
'Scholar.configure("getCollections", true);
Scholar.configure("dataMode", "rdf");
Scholar.addOption("exportNotes", true);
Scholar.addOption("exportFileData", true);',
Scholar.addOption("exportFileData", false);',
'function generateSeeAlso(resource, seeAlso) {
for(var i in seeAlso) {
Scholar.RDF.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false);
if(itemResources[seeAlso[i]]) {
Scholar.RDF.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false);
}
}
}
// Adds one dc:subject statement to resource for every entry of tags, passing
// true as the last argument in the same way as the other string-valued
// statements in this translator. A missing or empty tags value adds nothing.
function generateTags(resource, tags) {
	var key, tag;
	for(key in tags) {
		tag = tags[key];
		Scholar.RDF.addStatement(resource, n.dc+"subject", tag, true);
	}
}
@ -3746,7 +3754,11 @@ function generateCollection(collection) {
}
function handleAttachment(attachmentResource, attachment) {
Scholar.RDF.addStatement(attachmentResource, rdf+"type", n.fs+"File", false);
Scholar.RDF.addStatement(attachmentResource, rdf+"type", n.fs+"Attachment", false);
if(attachment.path) {
Scholar.RDF.addStatement(attachmentResource, rdf+"resource", attachment.path, false);
}
if(attachment.url) {
// add url as identifier
@ -3759,17 +3771,17 @@ function handleAttachment(attachmentResource, attachment) {
Scholar.RDF.addStatement(attachmentResource, n.dc+"identifier", term, false);
}
// add mime type
var term = Scholar.RDF.newResource();
// set term type
Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"IMT", false);
// set mime type value
Scholar.RDF.addStatement(term, rdf+"value", attachment.mimeType, true);
// add relationship to resource
Scholar.RDF.addStatement(attachmentResource, n.dc+"format", term, false);
Scholar.RDF.addStatement(attachmentResource, n.link+"type", attachment.mimeType, true);
// set charset value
if(attachment.charset) {
Scholar.RDF.addStatement(attachmentResource, n.link+"charset", attachment.charset, true);
}
// add title
Scholar.RDF.addStatement(attachmentResource, n.dc+"title", attachment.title, true);
// Add see also info to RDF
generateSeeAlso(attachmentResource, attachment.seeAlso);
generateTags(attachmentResource, attachment.tags);
}
function doExport() {
@ -3782,6 +3794,7 @@ function doExport() {
prism:"http://prismstandard.org/namespaces/1.2/basic/",
foaf:"http://xmlns.com/foaf/0.1/",
vcard:"http://nwalsh.com/rdf/vCard#",
link:"http://purl.org/rss/1.0/modules/link/",
fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
};
@ -3803,13 +3816,10 @@ function doExport() {
while(item = Scholar.nextItem()) {
items.push(item);
if(item.itemType == "attachment" && item.path) {
// file is stored locally (paths are always unique)
itemResources[item.itemID] = item.path+item.filename;
} else if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
itemResources[item.itemID] = "urn:isbn:"+item.ISBN;
usedResources[itemResources[item.itemID]] = true;
} else if(item.url && !usedResources[item.url]) {
} else if(item.itemType != "attachment" && item.url && !usedResources[item.url]) {
itemResources[item.itemID] = item.url;
usedResources[itemResources[item.itemID]] = true;
} else {
@ -3822,17 +3832,8 @@ function doExport() {
}
for each(var attachment in item.attachments) {
if(attachment.path) {
// file is stored locally (paths are always unique)
itemResources[attachment.itemID] = attachment.path+attachment.filename;
} else if(!usedResources[attachment.url]) {
// file is referenced via url, and no other item has this url
itemResources[attachment.itemID] = attachment.url;
usedResources[attachment.url] = true;
} else {
// just specify a node ID
itemResources[attachment.itemID] = "#item:"+attachment.itemID;
}
// just specify a node ID
itemResources[attachment.itemID] = "#item:"+attachment.itemID;
}
}
@ -4103,7 +4104,8 @@ function doExport() {
Scholar.RDF.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false);
// Add see also info to RDF
generateSeeAlso(resource, item.notes[j].seeAlso);
generateSeeAlso(noteResource, item.notes[j].seeAlso);
generateTags(noteResource, item.notes[j].tags);
}
if(item.note) {
@ -4115,18 +4117,14 @@ function doExport() {
for each(var attachment in item.attachments) {
var attachmentResource = itemResources[attachment.itemID];
Scholar.RDF.addStatement(resource, n.dc+"relation", attachmentResource, false);
Scholar.RDF.addStatement(resource, n.link+"link", attachmentResource, false);
handleAttachment(attachmentResource, attachment);
}
/** TAGS **/
/** SEE ALSO AND TAGS **/
for(var j in item.tags) {
Scholar.RDF.addStatement(resource, n.dc+"subject", item.tags[j], true);
}
// Add see also info to RDF
generateSeeAlso(resource, item.seeAlso);
generateTags(resource, item.tags);
}
/** RDF COLLECTION STRUCTURE **/
@ -4314,6 +4312,12 @@ function handleAttachment(node, attachment) {
}
attachment.title = getFirstResults(node, [n.dc+"title"], true);
var path = getFirstResults(node, [rdf+"resource"]);
if(path) {
attachment.path = Scholar.RDF.getResourceURI(path[0]);
}
attachment.charset = getFirstResults(node, [n.link+"charset"], true);
attachment.mimeType = getFirstResults(node, [n.link+"type"], true);
var identifiers = getFirstResults(node, [n.dc+"identifier"]);
for each(var identifier in identifiers) {
@ -4329,28 +4333,9 @@ function handleAttachment(node, attachment) {
}
}
var formats = getFirstResults(node, [n.dc+"format"]);
for each(var format in formats) {
if(typeof(format) != "string") {
var formatType = Scholar.RDF.getTargets(format, rdf+"type");
if(formatType) {
formatType = Scholar.RDF.getResourceURI(formatType[0]);
if(formatType == n.dcterms+"IMT") { // uri is url
attachment.mimeType = getFirstResults(format, [rdf+"value"], true);
}
}
}
}
var stringNode = node;
if(typeof(stringNode) != "string") {
stringNode = Scholar.RDF.getResourceURI(stringNode);
}
if(stringNode.substr(0, 8) == "file:///") {
// not a protocol specifier; we have a path name
attachment.path = stringNode;
}
// get seeAlso and tags
processSeeAlso(node, attachment);
processTags(node, attachment);
return attachment;
}
@ -4384,6 +4369,29 @@ function processCollection(node, collection) {
return collection;
}
// Fills in the identity and see-also data of an item being imported.
//
// node    - RDF resource the item is read from
// newItem - object to populate: itemID is set to the URI of node, and seeAlso
//           to an array holding the URI of every dc:relation target that
//           getFirstResults returns (left empty when it returns nothing)
function processSeeAlso(node, newItem) {
	newItem.itemID = Scholar.RDF.getResourceURI(node);
	newItem.seeAlso = new Array();
	
	var relations = getFirstResults(node, [n.dc+"relation"]);
	if(relations) {
		// standard for...in with an index instead of the non-standard "for each"
		for(var i in relations) {
			newItem.seeAlso.push(Scholar.RDF.getResourceURI(relations[i]));
		}
	}
}
// Collects the plain-string dc:subject values of node as tags.
//
// node    - RDF resource the item is read from
// newItem - object to populate: tags is set to an array of every dc:subject
//           value returned by getFirstResults that is a string; values of any
//           other type are skipped
function processTags(node, newItem) {
	newItem.tags = new Array();
	
	var subjects = getFirstResults(node, [n.dc+"subject"]);
	if(subjects) {
		// standard for...in with an index instead of the non-standard "for each"
		for(var i in subjects) {
			var subject = subjects[i];
			if(typeof(subject) == "string") {	// a regular tag
				newItem.tags.push(subject);
			}
		}
	}
}
// gets the node with a given type from an array
function getNodeByType(nodes, type) {
if(!nodes) {
@ -4402,6 +4410,23 @@ function getNodeByType(nodes, type) {
return false;
}
// returns true if this resource is part of another (related by any arc besides
// dc:relation or dcterms:hasPart), false otherwise (including when the
// resource has no incoming arcs at all)
//
// used to differentiate independent notes and files
function isPart(node) {
	var arcs = Scholar.RDF.getArcsIn(node);
	// standard for...in with an index instead of the non-standard "for each"
	for(var i in arcs) {
		var arc = Scholar.RDF.getResourceURI(arcs[i]);
		if(arc != n.dc+"relation" && arc != n.dcterms+"hasPart") {
			// related to another item by some arc besides see also; one such
			// arc settles the answer, so the remaining arcs are not inspected
			return true;
		}
	}
	return false;
}
function doImport() {
rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
@ -4412,6 +4437,7 @@ function doImport() {
prism:"http://prismstandard.org/namespaces/1.2/basic/",
foaf:"http://xmlns.com/foaf/0.1/",
vcard:"http://nwalsh.com/rdf/vCard#",
link:"http://purl.org/rss/1.0/modules/link/",
fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
};
@ -4432,6 +4458,13 @@ function doImport() {
newItem.itemID = Scholar.RDF.getResourceURI(node);
var container = undefined;
// figure out if this is a part of another resource, or a linked
// attachment
if(Scholar.RDF.getSources(node, n.dcterms+"isPartOf") ||
Scholar.RDF.getSources(node, n.link+"link")) {
continue;
}
// type
var type = Scholar.RDF.getTargets(node, rdf+"type");
// also deal with type detection based on parts, so we can differentiate
@ -4472,16 +4505,7 @@ function doImport() {
newItem.itemType = "website";
} else if(type == n.bib+"Memo") {
// check to see if this note is independent
var arcs = Scholar.RDF.getArcsIn(node);
var skip = false;
for each(var arc in arcs) {
arc = Scholar.RDF.getResourceURI(arc);
if(arc != n.dc+"relation" && arc != n.dcterms+"hasPart") {
// related to another item by some arc besides see also
skip = true;
}
}
if(skip) {
if(isPart(node)) {
continue;
}
@ -4490,10 +4514,9 @@ function doImport() {
// skip collections until all the items are done
collections.push(node);
continue;
} else if(type == n.fs+"File") {
} else if(type == n.fs+"Attachment") {
// check to see if file is independent
var arcs = Scholar.RDF.getArcsIn(node);
if(arcs.length) {
if(isPart(node)) {
continue;
}
@ -4627,12 +4650,7 @@ function doImport() {
newItem.journalAbbreviation = getFirstResults((container ? container : node), [n.dcterms+"alternative"], true);
// see also
var relations;
if(relations = getFirstResults(node, [n.dc+"relation"])) {
for each(var relation in relations) {
newItem.seeAlso.push(Scholar.RDF.getResourceURI(relation));
}
}
processSeeAlso(node, newItem);
/** NOTES **/
@ -4645,13 +4663,8 @@ function doImport() {
note.note = getFirstResults(referentNode, [rdf+"value", n.dc+"description"], true);
if(note.note != undefined) {
// handle see also
var relations;
if(relations = getFirstResults(referentNode, [n.dc+"relation"])) {
note.seeAlso = new Array();
for each(var relation in relations) {
note.seeAlso.push(Scholar.RDF.getResourceURI(relation));
}
}
processSeeAlso(referentNode, note);
processTags(referentNode, note);
// add note
newItem.notes.push(note);
@ -4681,18 +4694,16 @@ function doImport() {
}
}
/* ATTACHMENTS */
var relations = getFirstResults(node, [n.dc+"relation"]);
for each(var relation in relations) {
/** ATTACHMENTS **/
var relations = getFirstResults(node, [n.link+"link"]);
for each(var relation in relations) {
var type = Scholar.RDF.getTargets(relation, rdf+"type");
if(type) {
type = Scholar.RDF.getResourceURI(type[0]);
if(type == n.fs+"File") {
newItem.attachments.push(handleAttachment(relation));
}
if(Scholar.RDF.getResourceURI(type[0]) == n.fs+"Attachment") {
newItem.attachments.push(handleAttachment(relation));
}
}
Scholar.Utilities.debug(newItem);
newItem.complete();
}