{
	"translatorID":"f3f092bf-ae09-4be6-8855-a22ddd817925",
	"label":"ACM Digital Library",
	"creator":"Simon Kornblith, Michael Berkowitz and John McCaffery",
	"target":"^https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
	"minVersion":"1.0",
	"maxVersion":"",
	"priority":100,
	"inRepository":"1",
	"translatorType":4,
	"lastUpdated":"2010-11-10 23:55:19"
}

/**
 * XPath selecting every result link on a search results page.
 */
var searchResultX = '//td[@colspan="3"]/a[@class="medium-text" and @target="_self"]';

/**
 * XPath selecting every article link in a journal table of contents.
 */
var tocResultX = '//td[@colspan="1"]/span[@style]/a[contains(@href,"citation.cfm")]';

/**
 * XPath selecting the tag links when the tag list uses the justified layout
 * (links wrapped in a paragraph).
 */
var justifiedTagX = '//div[@id="divtags"]/p/a';

/**
 * XPath selecting the tag links when the tag list uses the un-justified layout
 * (links directly inside the tag container).
 */
var unjustifiedTagX = '//div[@id="divtags"]/a';

/**
 * XPath selecting the "more tags" link, identified by the javascript: href
 * that opens the 'thetags' window.
 */
var moreTagsX = '//a[@href="javascript:ColdFusion.Window.show(\'thetags\')"]';

/**
 * XPath selecting the individual tag elements inside the "more tags" popup.
 */
var moreTagX = '//a/span[@class="small-text"]';

/**
 * XPath selecting the title heading. Only used for debug output, so it is
 * not strictly required for scraping.
 */
var titleX = '//div[@class="large-text"]/h1[@class="mediumb-text"]/strong';

/**
 * XPath selecting the "Table of Contents" headline shown on a journal issue page.
 */
var tocX = "//div[@id='citationdetails']//h5[@class='medium-text' and contains(.,'Table of Contents')]";

/**
 * Work out what kind of page is being viewed.
 * @param doc The document of the page
 * @param url The URL of the page
 * @return "multiple" for a results.cfm URL; for a citation.cfm URL whatever
 *         getArticleType() reports; undefined for any other URL
 */
function detectWeb(doc, url) {
	var nsResolver = getNsResolver(doc);
	// The page title is looked up purely for the debug log
	Zotero.debug("Title: " + getText(titleX, doc, nsResolver));

	if (url.indexOf("/results.cfm") != -1) {
		Zotero.debug("Multiple items detected");
		return "multiple";
	}

	if (url.indexOf("/citation.cfm") != -1) {
		Zotero.debug("Single item detected");
		return getArticleType(doc, url, nsResolver);
	}
}

/**
 * Translate the current page: either a single citation or a list of them.
 *
 * A page classified as "multiple" by getArticleType() is handed to
 * scrapeMulti(), as a search result list when the URL contains "results.cfm"
 * and as a journal table of contents otherwise (getArticleType() only reports
 * "multiple" for those two cases). Any other page is scraped directly.
 *
 * @param doc The document of the page
 * @param url The URL of the page
 */
function doWeb(doc, url) {
	var nsResolver = getNsResolver(doc);

	if (getArticleType(doc, url, nsResolver) != "multiple") {
		// Single citation page
		scrape(doc);
		return;
	}

	var listType = (url.indexOf("results.cfm") != -1) ? "search" : "toc";
	var nothingSelected = scrapeMulti(doc, url, nsResolver, listType);

	// scrapeMulti() calls Zotero.done() from its processDocuments completion
	// callback. When it returns true no documents were queued, so that
	// callback will never run and there is nothing to wait for.
	if (!nothingSelected) Zotero.wait();
}

/**
 * Scrape search results and journal tables of contents.
 *
 * Collects every result link on the page, lets the user pick from them via
 * Zotero.selectItems(), then loads each chosen page and passes it to scrape().
 * Zotero.done() is called once all chosen pages have been processed.
 *
 * @param doc The document of the page
 * @param url The URL of the page (not used here)
 * @param nsResolver the namespace resolver function (not used here; the
 *        result XPaths are evaluated without a resolver)
 * @param type Type of result list: "toc" for a journal table of contents;
 *        "search" or anything else for a search result list
 * @return true when the user selected nothing, otherwise undefined
 */
function scrapeMulti(doc, url, nsResolver, type) {
	var listX;
	if (type == "toc") {
		Zotero.debug("Scraping journal TOC");
		listX = tocResultX;
	} else {
		if (type == "search") Zotero.debug("Scraping search");
		listX = searchResultX;
	}
	var resultPath = doc.evaluate(listX, doc, null, XPathResult.ANY_TYPE, null);

	// Map each result URL to its link text. The iterator must only be
	// advanced here: reading a node ahead of this loop would silently drop
	// the first result from the selection list.
	var urls = {};
	var found = 0;
	var node;
	while ((node = resultPath.iterateNext())) {
		urls[node.href] = node.textContent;
		found++;
	}
	Zotero.debug("Result links found: " + found);

	var items = Zotero.selectItems(urls);
	if (!items) return true;

	var selected = [];
	for (var itemUrl in items) selected.push(itemUrl);

	Zotero.Utilities.processDocuments(selected, scrape, function() { Zotero.done(); });
}

/**
 * Scrape a single citation page.
 *
 * Keywords, attachments, abstract, item type and journal title are read from
 * the page itself; everything else comes from the page's BibTeX export, which
 * is parsed by the import translator with ID 9cb70025-a888-4a29-a210-93ec52da40d4.
 * If no BibTeX text can be retrieved nothing is saved.
 *
 * @param doc The document of the page
 */
function scrape(doc) {
	var url = doc.location.href;
	var nsResolver = getNsResolver(doc);

	// Details that are not available from the bibtex record
	var tags = scrapeKeywords(doc);
	var attachments = scrapeAttachments(doc, url);
	var abs = scrapeAbstract(doc);
	var type = getArticleType(doc, url, nsResolver);
	var journal = getText("//meta[@name='citation_journal_title']/@content", doc, nsResolver);

	// The bibtex reference for this document as a string (null if not found)
	var bibtex = scrapeBibtex(url, nsResolver);
	if (!bibtex) {
		Zotero.debug("No bibtex reference found for " + url);
		return;
	}

	// Matches URLs that merely point at the ACM DOI resolver
	var doiResolverPrefix = /^https?:\/\/doi\.acm\.org\//;

	// Use the bibtex translator to parse the bibtex string
	var translator = Zotero.loadTranslator("import");
	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	translator.setString(bibtex);
	translator.setHandler("itemDone", function(obj, newItem) {
		// Add the details scraped from the page
		if (abs) newItem.abstractNote = abs;
		newItem.tags = tags;
		newItem.attachments = attachments;
		newItem.itemType = type;

		// Prefer the journal title from the page metadata; keep the bibtex
		// title as the abbreviation when the two differ
		if (journal && journal != newItem.publicationTitle) {
			newItem.journalAbbreviation = newItem.publicationTitle;
			newItem.publicationTitle = journal;
		}

		// url and DOI may be missing from the bibtex record, so check
		// before touching them. If the URL is just a DOI, clear it.
		if (newItem.url && doiResolverPrefix.test(newItem.url)) newItem.url = "";
		if (newItem.DOI) newItem.DOI = newItem.DOI.replace(doiResolverPrefix, '');

		var acmid = bibtex.match(/acmid = \{(\d+)\}/);
		if (acmid) newItem.extra = "ACM ID: " + acmid[1];

		newItem.complete();
	});

	// Trigger the translation
	translator.translate();
}

/**
 * Collect the keywords (tags) attached to this document.
 *
 * The "more tags" popup is tried first; if scrapeMoreTagsKeywords() yields a
 * list, that list is returned as-is. Otherwise the tag list on the page is
 * read, trying the justified layout first and the un-justified layout second.
 *
 * @param doc The document of the page
 * @return an array of lower-cased, whitespace-normalised keywords
 */
function scrapeKeywords(doc) {
	Zotero.debug("Scraping Keywords");

	var popupKeywords = scrapeMoreTagsKeywords(doc);
	if (popupKeywords) return popupKeywords;

	var tagNodes = doc.evaluate(justifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
	var tagNode = tagNodes.iterateNext();
	if (!tagNode) {
		// Nothing in the justified layout, fall back to the un-justified one
		tagNodes = doc.evaluate(unjustifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
		tagNode = tagNodes.iterateNext();
	}

	var found = [];
	for (; tagNode; tagNode = tagNodes.iterateNext()) {
		found.push(Zotero.Utilities.trimInternal(tagNode.textContent.toLowerCase()));
		Zotero.debug("Keyword: " + tagNode.textContent.toLowerCase());
	}
	return found;
}

/**
 * Collect keywords from the "more tags" popup, if the page has one.
 *
 * The popup is considered present when the page contains the "more tags"
 * link matched by moreTagsX.
 *
 * @param doc The document of the page
 * @return null when there is no "more tags" link; otherwise an array of
 *         lower-cased, whitespace-normalised keywords taken from the popup
 */
function scrapeMoreTagsKeywords(doc) {
	var moreLinks = doc.evaluate(moreTagsX, doc, null, XPathResult.ANY_TYPE, null);
	var hasMoreLink = moreLinks ? moreLinks.iterateNext() : null;
	if (!hasMoreLink) {
		return null;
	}

	var popupTags = doc.evaluate(moreTagX, doc, null, XPathResult.ANY_TYPE, null);
	var collected = [];
	for (var tag = popupTags.iterateNext(); tag; tag = popupTags.iterateNext()) {
		collected.push(Zotero.Utilities.trimInternal(tag.textContent.toLowerCase()));
		Zotero.debug("Keyword: " + tag.textContent.toLowerCase());
	}
	return collected;
}

/**
 * Build the attachment list for a citation page.
 *
 * The list always starts with an HTML snapshot of the page itself, followed
 * by one entry per full-text link found on the page. Full-text links are
 * recognised by their name attribute ("FullTextPdf", "FullTextHtml" or
 * "FullText Html"); a link named "FullTextPdf" is recorded as a PDF, the
 * others as HTML. Links without a target are ignored.
 *
 * @param doc The document of the page
 * @param url The URL of the page, used for the snapshot
 * @return an array of attachment descriptors ({title, mimeType, url})
 */
function scrapeAttachments(doc, url) {
	Zotero.debug("Scraping attachments");

	// The snapshot of this page always comes first
	var attachments = [{title:"ACM Snapshot", mimeType:"text/html", url:url}];

	var links = doc.evaluate('//a[@name="FullTextPdf" or @name="FullTextHtml" or @name="FullText Html"]', doc, null, XPathResult.ANY_TYPE, null);
	var link;
	while ((link = links.iterateNext())) {
		var target = link.href;

		// The XPath matches on the name attribute only, so an anchor that
		// carries no href can show up here; it has nothing to download.
		if (!target) {
			Zotero.debug("Skipping full text link without a target");
			continue;
		}

		if (link.name == "FullTextPdf") {
			Zotero.debug("Text PDF: " + target);
			attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:target});
		} else {
			// Anything else matched by the XPath is an HTML full text link
			Zotero.debug("Text Page: " + target);
			attachments.push({title:"ACM Full Text HTML", mimeType:"text/html", url:target});
		}
	}

	return attachments;
}

/**
 * Read the abstract from the page.
 *
 * The abstract is taken from the first div whose style attribute is exactly
 * "display: inline;".
 *
 * @param doc The document of the page
 * @return whatever getText() returns for that element
 */
function scrapeAbstract(doc) {
	Zotero.debug("Scraping abstract");
	return getText('//div[@style="display: inline;"]', doc);
}

/**
 * Fetch the BibTeX export of a document and return it as text.
 *
 * The export page is requested from portal.acm.org using the ID extracted
 * from the page URL; the reference is read from the first pre element of the
 * retrieved document.
 *
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return the bibtex reference with whitespace normalised and trimmed, or
 *         null when no pre element with text content was found
 */
function scrapeBibtex(url, nsResolver) {
	Zotero.debug("Scraping full details from bibtex");

	// Address of the bibtex export for this document's ID
	var exportUrl = "http://portal.acm.org/exportformats.cfm?id=" + getId(url) + "&expformat=bibtex";
	Zotero.debug("Bibtex: " + exportUrl);

	var exportDoc = Zotero.Utilities.retrieveDocument(exportUrl);
	var pre = exportDoc.evaluate('//pre', exportDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();

	if (pre == null || pre.textContent == null) {
		return null;
	}

	var raw = pre.textContent;
	Zotero.debug("\nref : " + raw);
	return Zotero.Utilities.trim(Zotero.Utilities.trimInternal(raw));
}

/**
 * Get the unique identifier of this document.
 *
 * The identifier is the value of the "id" query parameter, wherever that
 * parameter appears in the query string; other parameters and any #fragment
 * are ignored. A value of the form "A.B" is split at its first dot: B is
 * returned by default and A is returned when journal is truthy.
 *
 * @param url The URL of the page being scanned
 * @param journal [optional] whether to return the part before the dot (the
 *        ID of the journal the document is in) instead of the part after it
 * @return a string containing the requested identifier
 */
function getId(url, journal) {
	var id;
	var param = /[?&]id=([^&#]*)/.exec(url);
	if (param) {
		id = param[1];
	} else {
		// No "id" parameter: keep the positional extraction, which takes
		// whatever follows ".cfm" plus four more characters up to the
		// next "&"
		id = url.substr(url.indexOf(".cfm") + 8).split("&")[0];
	}

	var dotIndex = id.indexOf('.');
	if (dotIndex == -1) return id;

	return journal ? id.substring(0, dotIndex) : id.substring(dotIndex + 1);
}

/**
 * Find out what kind of document this page describes.
 *
 * A page is "multiple" when its URL contains "results.cfm" or when it shows a
 * "Table of Contents" headline. Otherwise the type is derived from the text
 * of the table cell that names the publication venue.
 *
 * @param doc The document of the page
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return "multiple", "conferencePaper" (venue text contains "Proceeding"),
 *         "magazineArticle" (venue text contains "Magazine") or, for anything
 *         else, "journalArticle"
 */
function getArticleType(doc, url, nsResolver) {
	var tocHeading = doc.evaluate(tocX, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (tocHeading || url.indexOf("results.cfm") != -1) {
		Zotero.debug("Type: multiple");
		return "multiple";
	}

	// The table cell which has either "Journal" or "Proceeding" in it
	var venue = getText('//td[@nowrap="nowrap" and @style="padding-bottom: 0px;"]', doc, nsResolver);
	Zotero.debug("Type: " + venue);

	// Checked in order; the first marker found in the venue text wins
	var markers = [
		["Proceeding", "conferencePaper"],
		["Magazine", "magazineArticle"]
	];
	for (var i = 0; i < markers.length; i++) {
		if (venue.indexOf(markers[i][0]) != -1) return markers[i][1];
	}
	return "journalArticle";
}

/**
 * Get the text of the first node matched by an XPath expression.
 *
 * @param pathString the XPath indicating which node to get the text from
 * @param doc The document to evaluate the XPath against
 * @param nsResolver the namespace resolver function
 * @return the text content of the first matching node, or an empty string
 *         (after logging a debug message) when nothing matched or the node
 *         has no text content
 */
function getText(pathString, doc, nsResolver) {
	var firstNode = doc.evaluate(pathString, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();

	if (firstNode != null && firstNode.textContent != null) {
		return firstNode.textContent;
	}

	Zotero.debug("Unable to retrieve text for XPath: "+pathString);
	return "";
}

/**
 * Build a namespace resolver for XPath evaluation against a document.
 *
 * @param doc The document whose root element namespace should be resolved
 * @return null when the document's root element has no namespace; otherwise
 *         a function mapping the prefix 'x' to that namespace and every
 *         other prefix to null
 */
function getNsResolver(doc) {
	var namespace = doc.documentElement.namespaceURI;
	if (!namespace) {
		return null;
	}

	return function(prefix) {
		return prefix == 'x' ? namespace : null;
	};
}