{
	"translatorID": "f3f092bf-ae09-4be6-8855-a22ddd817925",
	"label": "ACM Digital Library",
	"creator": "Simon Kornblith, Michael Berkowitz and John McCaffery",
	"target": "^https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
	"minVersion": "1.0",
	"maxVersion": "",
	"priority": 100,
	"inRepository": true,
	"translatorType": 4,
	"lastUpdated": "2011-06-10 01:14:01"
}
/**
 * The XPath for all the search result <a> elements
 */
var searchResultX = '//td[@colspan="3"]/a[@class="medium-text" and @target="_self"]';
/**
 * The XPath for all the journal TOC <a> elements
 */
var tocResultX = '//td[@colspan="1"]/span[@style]/a[contains(@href,"citation.cfm")]';
/**
 * The XPath for the tag elements in a justified format tags list
 */
var justifiedTagX = '//div[@id="divtags"]/p/a';
/**
 * The XPath for the tag elements in an un-justified format tags list
 */
var unjustifiedTagX = '//div[@id="divtags"]/a';
/**
 * The XPath for the "more tags" link element
 */
var moreTagsX = '//a[@href="javascript:ColdFusion.Window.show(' + "'thetags'" + ')"]';
/**
 * The XPath for the tag elements in the "more tags" popup
 */
var moreTagX = '//a/span[@class="small-text"]';
/**
 * The XPath for the title heading element - not strictly necessary, more helpful for debugging
 */
var titleX = '//div[@class="large-text"]/h1[@class="mediumb-text"]/strong';
/**
 * The XPath for the "Table of Contents" headline of a journal issue
 */
var tocX = "//div[@id='citationdetails']//h5[@class='medium-text' and contains(.,'Table of Contents')]";
/**
 * Scan to see what type of page this is
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @return "multiple" for a search results page; for a citation page, whatever
 *         getArticleType reports; undefined for any other URL
 */
function detectWeb(doc, url) {
	var nsResolver = getNsResolver(doc);
	// The title is only looked up so that it appears in the debug log
	var title = getText(titleX, doc, nsResolver);
	Zotero.debug("Title: " + title);

	if (url.indexOf("/results.cfm") !== -1) {
		Zotero.debug("Multiple items detected");
		return "multiple";
	}
	if (url.indexOf("/citation.cfm") !== -1) {
		Zotero.debug("Single item detected");
		return getArticleType(doc, url, nsResolver);
	}
	// Neither a results page nor a citation page: fall through, returning undefined
}
/**
 * Parse the page
 *
 * Single-item pages are handed straight to scrape. Pages that getArticleType
 * reports as "multiple" are handed to scrapeMulti, as a "search" when the URL
 * contains "results.cfm" and as a "toc" otherwise.
 *
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 */
function doWeb(doc, url) {
	var nsResolver = getNsResolver(doc);

	// If this is a single page
	if (getArticleType(doc, url, nsResolver) != "multiple") {
		scrape(doc, url, nsResolver);
		return;
	}

	// getArticleType only answers "multiple" for a results URL or when the
	// table-of-contents heading was found, so anything that is not a search
	// is a TOC; no second, stricter text comparison is needed here.
	var type = url.indexOf("results.cfm") !== -1 ? "search" : "toc";

	// scrapeMulti returns true when the user selected nothing; in that case no
	// asynchronous work was started and waiting would never be ended by
	// Zotero.done().
	if (scrapeMulti(doc, url, nsResolver, type) !== true) {
		Zotero.wait();
	}
}
/**
 * Scrape search results and journal tables of contents
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @param type Type of result -- "search" or "toc"; any other value is treated like "search"
 * @return true if the user selected nothing, otherwise undefined
 */
function scrapeMulti(doc, url, nsResolver, type) {
	var resultX;
	if (type == "toc") {
		Zotero.debug("Scraping journal TOC");
		resultX = tocResultX;
	} else {
		if (type == "search") Zotero.debug("Scraping search");
		resultX = searchResultX;
	}
	var resultPath = doc.evaluate(resultX, doc, null, XPathResult.ANY_TYPE, null);

	// Map each result's URL (with the flat layout requested) to its link text
	var candidates = {};
	var node;
	while ((node = resultPath.iterateNext())) {
		candidates[node.href + '&preflayout=flat'] = node.textContent;
	}

	// Let the user pick which results to save
	var items = Zotero.selectItems(candidates);
	if (!items) return true;

	// The keys of the selection are the URLs to process
	var urls = [];
	for (var i in items) urls.push(i);
	Zotero.Utilities.processDocuments(urls, scrape, function () { Zotero.done(); });
}
/**
 * Scrape a single page
 *
 * Keywords, attachments and the abstract are read from the page itself; all
 * other fields come from the BibTeX export, which is parsed by the BibTeX
 * import translator.
 *
 * @param doc The XML document describing the page
 */
function scrape(doc) {
	var url = doc.location.href;
	var nsResolver = getNsResolver(doc);
	// Get all the details not scraped from the bibtex file
	var tags = scrapeKeywords(doc);
	var attachments = scrapeAttachments(doc, url);
	var abs = scrapeAbstract(doc);
	// The item type detected from the page is deliberately not used: it was less
	// reliable than BibTeX, see http://forums.zotero.org/discussion/17246/
	// The journal title from the page is not used either, because some pieces are
	// multiply published, see
	// http://portal.acm.org/citation.cfm?id=52400.52432&coll=DL&dl=GUIDECFID=16073284&CFTOKEN=77905982
	// and also http://forums.zotero.org/discussion/17532/

	// Get the bibtex reference for this document as a string
	var bibtex = scrapeBibtex(url, nsResolver);
	if (!bibtex) {
		// scrapeBibtex returns null when the export page has no usable <pre>
		Zotero.debug("No BibTeX found for " + url);
		return;
	}

	// Use the bibtex translator to parse the bibtex string
	var translator = Zotero.loadTranslator("import");
	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	translator.setString(bibtex);
	// Set the function to run when the bibtex string has been parsed
	translator.setHandler("itemDone", function (obj, newItem) {
		// Store all details not parsed from the bibtex
		if (abs) newItem.abstractNote = abs;
		newItem.tags = tags;
		newItem.attachments = attachments;
		// If the URL is just a DOI, clear it.
		if (newItem.url && newItem.url.match(/^http:\/\/doi\.acm\.org\//)) newItem.url = "";
		// Keep only the bare DOI, without the resolver prefix
		if (newItem.DOI) newItem.DOI = newItem.DOI.replace(/^http:\/\/doi\.acm\.org\//, '');
		var acmid = bibtex.match(/acmid = {(\d+)}/);
		if (acmid) newItem.extra = "ACM ID: " + acmid[1];
		// Move whatever the BibTeX import stored in archiveLocation to place
		newItem.place = newItem.archiveLocation;
		newItem.archiveLocation = null;
		// Complete the parsing of the page
		newItem.complete();
	});
	// Trigger the translation
	translator.translate();
}
/**
 * Scrape all keywords attached to this document
 *
 * The "more tags" popup is preferred when the page has one; otherwise the tag
 * list is read from the page, trying the justified layout first and the
 * un-justified layout second.
 *
 * @param doc The XML document describing the page
 * @return an array of all keywords attached to this document, lower-cased
 */
function scrapeKeywords(doc) {
	Zotero.debug("Scraping Keywords");
	// Try scraping keywords from the "more keywords" popup
	var keywords = scrapeMoreTagsKeywords(doc);
	if (keywords) return keywords;

	keywords = [];
	// Otherwise look for the keywords - check justified format
	var keywordPath = doc.evaluate(justifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
	var keywordNode = keywordPath.iterateNext();
	// If justified format didn't work check unjustified
	if (!keywordNode) {
		keywordPath = doc.evaluate(unjustifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
		keywordNode = keywordPath.iterateNext();
	}
	// Iterate through all the keywords
	while (keywordNode) {
		var keyword = keywordNode.textContent.toLowerCase();
		keywords.push(Zotero.Utilities.trimInternal(keyword));
		Zotero.debug("Keyword: " + keyword);
		keywordNode = keywordPath.iterateNext();
	}
	return keywords;
}
/**
 * Scrape keywords from a "more tags" popup
 * @param doc The XML document describing the page
 * @return null if the page has no "more tags" link; otherwise an array of the
 *         lower-cased keywords found in the popup (possibly empty)
 */
function scrapeMoreTagsKeywords(doc) {
	// Look for the link that opens the "more tags" popup
	var morePath = doc.evaluate(moreTagsX, doc, null, XPathResult.ANY_TYPE, null);
	var moreNode = morePath ? morePath.iterateNext() : null;
	// If there is no "more tags" popup
	if (!moreNode) return null;

	var keywords = [];
	var keywordPath = doc.evaluate(moreTagX, doc, null, XPathResult.ANY_TYPE, null);
	var keywordNode;
	// Iterate through all the keywords
	while ((keywordNode = keywordPath.iterateNext())) {
		var keyword = keywordNode.textContent.toLowerCase();
		keywords.push(Zotero.Utilities.trimInternal(keyword));
		Zotero.debug("Keyword: " + keyword);
	}
	return keywords;
}
/**
 * Scrape all the relevant attachments from the page.
 * Firstly adds a snapshot of the ACM page, then looks for any links to the full text
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @return an array of all the attachments
 */
function scrapeAttachments(doc, url) {
	Zotero.debug("Scraping attachments");
	// Start with the snapshot of this page
	var attachments = [{ title: "ACM Snapshot", mimeType: "text/html", url: url }];

	// XPath for the full text links; links inside the "flatbody" div are skipped
	var textPath = doc.evaluate('//a[@name="FullTextPdf" or @name="FullTextHtml" or @name="FullText Html"][not(ancestor::div[@class="flatbody"])]', doc, null, XPathResult.ANY_TYPE, null);
	var textNode;
	// Iterate through all the links
	while ((textNode = textPath.iterateNext())) {
		var textURL = textNode.href;
		if (textNode.name == "FullTextPdf") {
			// The full text is a PDF
			Zotero.debug("Text PDF: " + textURL);
			attachments.push({ title: "ACM Full Text PDF", mimeType: "application/pdf", url: textURL });
		} else {
			// Otherwise the text is an HTML link
			Zotero.debug("Text Page: " + textURL);
			attachments.push({ title: "ACM Full Text HTML", mimeType: "text/html", url: textURL });
		}
	}
	return attachments;
}
/**
 * Scrape the abstract from the page
 * @param doc The XML document describing the page
 * @return a string with the whitespace-normalised text of the first
 *         "flatbody" or "tabbody" div on the page
 */
function scrapeAbstract(doc) {
	Zotero.debug("Scraping abstract");
	var text = getText('//div[@class="flatbody" or @class="tabbody"]', doc);
	return Zotero.Utilities.trimInternal(text);
}
/**
 * Get the text of the bibtex format reference
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return the bibtex reference as a trimmed string, or null if the export
 *         page could not be retrieved or contains no <pre> element with text
 */
function scrapeBibtex(url, nsResolver) {
	Zotero.debug("Scraping full details from bibtex");
	// Get the ID of this document
	var id = getId(url);
	// The link of the bibtex popup
	var bibtex = "http://portal.acm.org/exportformats.cfm?id=" + id + "&expformat=bibtex";
	Zotero.debug("Bibtex: " + bibtex);

	// Get the xml document which will be loaded into the popup box
	var texDoc = Zotero.Utilities.retrieveDocument(bibtex);
	if (!texDoc) return null;

	// Find the node with the bibtex text in it
	var path = texDoc.evaluate('//pre', texDoc, nsResolver, XPathResult.ANY_TYPE, null);
	var node = path.iterateNext();
	if (node == null || node.textContent == null) return null;

	var ref = node.textContent;
	Zotero.debug("\nref : " + ref);
	ref = Zotero.Utilities.trimInternal(ref);
	ref = Zotero.Utilities.trim(ref);
	return ref;
}
/**
 * Get the unique identifier of this document
 *
 * The value of the "id" query parameter is used wherever it appears in the
 * query string. An id of the form "A.B" is split at the first dot: the part
 * before it is returned when journal is true, the part after it otherwise.
 *
 * @param url The URL of the page being scanned
 * @param journal [optional] whether to get the ID of the journal the document is in or of the document itself
 * @return a string containing the identifier of the document or journal the document is in
 */
function getId(url, journal) {
	if (journal === undefined) journal = false;

	var id;
	var match = /[?&]id=([^&#]*)/.exec(url);
	if (match) {
		id = match[1];
	} else {
		// No "id" parameter: fall back to the positional slicing used
		// previously, which skips the 8 characters of ".cfm?id=" (or any
		// equally long prefix) and cuts at the next "&".
		id = url.substr(url.indexOf(".cfm") + 8);
		var ampIndex = id.indexOf("&");
		if (ampIndex !== -1) id = id.substring(0, ampIndex);
	}

	var dotIndex = id.indexOf(".");
	if (dotIndex !== -1) {
		id = journal ? id.substring(0, dotIndex) : id.substring(dotIndex + 1);
	}
	return id;
}
/**
 * Find out what kind of document this is
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return "multiple" for a results URL or a page with a table-of-contents
 *         heading; otherwise "conferencePaper", "magazineArticle" or
 *         "journalArticle" (the default), depending on the page's type cell
 */
function getArticleType(doc, url, nsResolver) {
	var toc = doc.evaluate(tocX, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (url.indexOf("results.cfm") !== -1 || toc) {
		Zotero.debug("Type: multiple");
		return "multiple";
	}

	// XPath for the table cell which has either "Journal" or "Proceeding" in it
	var text = getText('//td[@nowrap="nowrap" and @style="padding-bottom: 0px;"]', doc, nsResolver);
	Zotero.debug("Type: " + text);
	if (text.indexOf("Proceeding") !== -1) return "conferencePaper";
	if (text.indexOf("Magazine") !== -1) return "magazineArticle";
	return "journalArticle";
}
/**
 * Get the text from the first node defined by the given xPathString
 * @param pathString the XPath indicating which node to get the text from
 * @param doc The XML document describing the page
 * @param nsResolver the namespace resolver function
 * @return the text in the defined node, or an empty string if the node was not found or has no text content
 */
function getText(pathString, doc, nsResolver) {
	var path = doc.evaluate(pathString, doc, nsResolver, XPathResult.ANY_TYPE, null);
	var node = path.iterateNext();
	// "== null" covers both null and undefined
	if (node == null || node.textContent == null) {
		Zotero.debug("Unable to retrieve text for XPath: " + pathString);
		return "";
	}
	return node.textContent;
}
/**
 * Get a function for returning the namespace of a given document given its prefix
 * @param doc The XML document describing the page
 * @return null if the document element has no namespace URI; otherwise a
 *         function that maps the prefix 'x' to that URI and any other prefix to null
 */
function getNsResolver(doc) {
	var namespace = doc.documentElement.namespaceURI;
	if (!namespace) return null;
	return function (prefix) {
		return prefix == 'x' ? namespace : null;
	};
}