zotero/chrome/content/zotero/webpagedump/domsaver.js

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is ScrapBook.
 *
 * The Initial Developer of the Original Code is Gomita.
 * Portions created by the Initial Developer are Copyright (C) 2004
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Bernhard Pollak <pollak@dbai.tuwien.ac.at> (WebPageDump Fork)
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU Affero General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
// --------------------------------------------------------------------------------
// "WebPageDump" Firefox Extension
// --------------------------------------------------------------------------------
// - File: "domsaver.js" -
// - Description:
//   Makes a (hopefully perfect) local copy of the actual open webpage.
//   Current Browsers make sometimes errors when saving a webpage. The files
//   will be saved in one flat directory (without subdirs)
// - Using:
//   1. call "wpdDOMSaver.init(filePath)" and pass the full destination path
//   2. afterwards call "wpdDOMSaver.saveHTMLDocument" for saving the (active) window
// --------------------------------------------------------------------------------
// Call Tree Overview - wpdDOMSaver
//
// saveHTMLDocument
//	saveDocumentEx          (decide if we have a HTML or another file)
//	  saveDocumentFile        (we have a non HTML file (e.g for embedded objects - images, movies,...))
//	    download								(we download the file and ...)
//      writefile               (... make a HTML wrapper file)
//	  saveDocumentHTML        (we have a HTML File)
//			processDOMRecursively   (go through the DOM nodes)
//			  processDOMNode          (for each node we do extensive processing (links, javascript,...))
//			    download                (for image,flash,... references)
//			    saveDocumentEx ...  	  (starting again with "saveDocumentEx" for frame documents)
//      saveDocumentCSS       (save CSS File)
//			  processCSSRecursively   (process the CSS text)
//			    processCSSText          (do some replacement stuff and link processing)
//			      download                (download CSS image references)
//      generateHTMLString     (create the HTML string)
//
//
// --------------------------------------------------------------------------------
//
//
// TO DO: use version information from rdf file...
var WPD_VERSION = "0.2";


// Bug variables: set to false if the bug is not present anymore

// CRLFBUG: Innerhtml trims the text inside a tag. This lead
// to problems with the PRE Tag where sometimes one starting
// carriage return is lost...
var WPD_CRLFBUG = true;

// ENTITYBUG: HTML Entities are lost inside the DOM Tree (they
// are converted to corresponding unicode characters) which
// results in problems when using a non unicode charset as output
// target where this values/symbols do not exist. So we call
// the ConvertToEntities XPCOM function for generating usual
// HTML Entities...
// (this is precisely not a bug but a concept failure)
var WPD_ENTITYBUG = false;

// CSSSCROLLBUG: The css "scroll" property of "background" is
// loosing the zero vertical position leading to a false
// positioned background (centered by default)...
var WPD_CSSSCROLLBUG = true;
// CSSBACKGROUNDPOSITIONBUG: "background-position 0 0" is
// loosing the zero vertical position
var WPD_CSSBACKGROUNDPOSITIONBUG = true;

// DOCTYPEBUG: If the doctype is inserted before
// the <HTML> tag there would be rendering errors with the
// right to left writing direction, because there are problems
// with the DIR attribute (text direction (rtl,ltr,lro,rlo))
// Positioning the doctype below the <HTML> Tag would fix the
// problem.
// But: inserting the docctype only below the <HTML> tag
// results in small layout changes in some tables. So we
// leave the doctype at the original position  before the
// HTML tag <HTML> and insert the doctype entry  a second
// time below the <HTML> tag...
var WPD_DOCTYPEBUG = false;

// JAVASCRIPTSRCBUG: Deleting the "src" attribute together
// with the whole <SCRIPT> tag may result in unexpected
// layout changes (table width is changed). So we set the
// "src" attribute of the <SCRIPT> tag to an empty string
// and don<6F>t delete the whole tag...
// Remark: it may be necessary to use an invalid ip address
// (e.g. http://0.0.0.0) but this may lead to other strange
// layout dependencies...
var WPD_JAVASCRIPTSRCBUG = true;

// CLONENODEBUG: CloneNode copies only the initial state of
// the INPUT fields and ignores the actual values of the fields
// We introduced this.curBody and the getCurrentNodeValue function.
var WPD_CLONENODEBUG = true;


var wpdDOMSaver = {

	name: "",
	document: null, // the original document
	curDocument: null, // the current document
	curCharacterSet: "", // the current characterset
	curBody: null, // the current body node (inclusive child nodes)
	currentDir: "",
	baseURL: "", // the original base url
	currentURL: "", // the current url (necessary for frames)
	fileInfo: [], // for saving already processed files and double name checking
	// (cause we use one flat directory for all files)
	option: {},
	frameList: [],
	frameNumber: 0,
	dateObj: null,

	// initialize the properties (set document, URL, Directory, ...)
	init: function (fileName, document) {
		Zotero.debug("[wpdDOMSaver.init] ...");

		this.name = "";
		this.document = null;
		this.curDocument = null;
		this.curCharacterSet = "";
		this.curBody = null;
		this.currentDir = "";
		this.baseURL = "";
		this.currentURL = "";
		this.fileInfo = []; // clear registered downloaded files...

		this.option = {};
		this.frameList = []; // clear frame list
		this.frameNumber = 0;

		this.dateObj = new Date();


		// Split fileName in Path and Name

		this.name = wpdCommon.getValidFileName(
			wpdCommon.getFileLeafName(fileName)); // extract fileName from filePath
		this.currentDir = wpdCommon.getFilePath(fileName); // only directory
		this.name = wpdCommon.splitFileName(this.name)[0]; // no extension!


		// Added by Dan S. for Zotero, replacing three lines below
		this.document = document;
		this.setFrameList(document.defaultView);
		this.baseURL = document.location.href;


		// Set the document and frames
		//this.document = top.window._content.document;

		//this.setFrameList(top.window._content);

		// set the urls
		//this.baseURL = wpdCommon.getURL();               // initial base url
		this.currentURL = this.baseURL; // current base url - needed for frame processing
		//   (without frames this property will always be like the baseURL)

		// default options - for the files which should be downloaded
		// (this is only for external link references not for the embedded files)
		this.option = {
			"image": false,
				"sound": false,
				"movie": false,
				"archive": false,
				"custom": "", // comma delimited custom extensions (e.g. doc,xls,...)
			"format": true, // when false we get only naked html without images

			// Changed by Dan for Zotero
			"script": true, // no scripts

			"encodeUTF8": true, // write the DOM Tree as UTF-8 and change the charset entry of the document
			"metainfo": true, // include meta tags with URL and date/time information
			"metacharset": false // if the meta charset is defined inside html override document charset
			//"xtagging"    : true      // include a x tag around each word
		};


	},


	// get all frames in the document (recursively) and save in this.frameList
	setFrameList: function (aDocument) {
		try {
			for (var f = 0; f < aDocument.frames.length; f++) {
				this.frameList.push(aDocument.frames[f]);
				this.setFrameList(aDocument.frames[f]);
			}
		} catch (ex) {}
	},

	// resolve the javascript links inside the attributes (e.g. onclick,...)
	normalizeJavaScriptLink: function (aNode, aAttr) {
		var val = aNode.getAttribute(aAttr); // get the attribute value and check for link stuff
		if (!val || !val.match(/\(\'([^\']+)\'/)) return aNode;
		val = RegExp.$1;
		if (val.indexOf("/") == -1 && val.indexOf(".") == -1) return aNode;
		val = wpdCommon.resolveURL(this.currentURL, val); // it is a link -> resolve and set the URL to the local URL
		if (aNode.nodeName.toLowerCase() == "img") {
			if (aNode.parentNode.nodeName.toLowerCase() == "a") {
				aNode.parentNode.setAttribute("href", val); // change the href of img to the onclick url
				aNode.removeAttribute("onclick");
			} else {
				val = "window.open('" + val + "');"; // if this is not a reference make a window open function for the img
				aNode.setAttribute(aAttr, val);
			}
		} else {
			if (aNode.hasAttribute("href") && aNode.getAttribute("href").indexOf("http://") != 0) {
				aNode.setAttribute("href", val);
				aNode.removeAttribute("onclick");
			}
		}
		return aNode;
	},

	// check if the file extension of the url is specified in the options array
	checkFileTypeOptions: function (aURL) {
		var ext = wpdCommon.splitFileName(wpdCommon.getFileName(aURL))[1].toLowerCase();
		var flag = false;
		switch (ext) {
			case "jpg":
			case "jpeg":
			case "png":
			case "gif":
				flag = this.option["image"];
				break;
			case "mp3":
			case "wav":
			case "ram":
			case "wma":
				flag = this.option["sound"];
				break;
			case "mpg":
			case "mpeg":
			case "avi":
			case "ram":
			case "rm":
			case "mov":
			case "wmv":
				flag = this.option["movie"];
				break;
			case "zip":
			case "lzh":
			case "rar":
			case "xpi":
				flag = this.option["archive"];
				break;
			default:
				if (ext && this.option["custom"]) {
					if ((", " + this.option["custom"] + ", ").indexOf(", " + ext + ", ") != -1) flag = true;
				}
		}
		if (aURL.indexOf("file://") == 0 && !aURL.match(/\.html$/)) flag = true;
		return flag;
	},


	// do the conversion from the DOM Text to the destination Charset
	convertEntity: function (aText) {
		if (this.option["encodeUTF8"]) {
			return wpdCommon.unicodeToEntity(aText, "UTF-8");
		} else {
			return wpdCommon.unicodeToEntity(aText, this.curCharacterSet);
		}
	},

	// we only can manage GIF animations - Flash does not work...
	disableAnimation: function (aNode) {
		// thanx to pageanimator extension...
		/* try {
      //dump("inspecting "+aNode.nodeName+"\n");
      //aNode.setAttribute("swLiveConnect", "true");
      aNode.StopPlay();
      dump ("prepare flash deanimation ... ");
      if ( aNode.hasAttribute("play") ) aNode.setAttribute("play", "false");
      dump ("flash deanimation ... ");
      aNode.Rewind(); // seems to be the key for some obnoxious instances
      aNode.StopPlay();
      dump ("ready! \n");
    } catch (e) {}     */
		try {
			var container = aNode.QueryInterface(Components.interfaces.nsIImageLoadingContent)
				.getRequest(Components.interfaces.nsIImageLoadingContent.CURRENT_REQUEST)
				.image;
			container.animationMode = Components.interfaces.imgIContainer.kDontAnimMode;
		} catch (e) {}
	},

	// get the node value of aNode directly from the actual DOM tree (WPD_CLONENODEBUG)
	getCurrentNodeValue: function (aNode) {
		try {
			this.curDocument.body.cloneNode(false);
			var body = this.curDocument.body;
		} catch (ex) {
			var body = this.curDocument.getElementsByTagName("body")[0];
		}
		var refnodes = body.getElementsByTagName(aNode.nodeName);
		var nodes = this.curBody.getElementsByTagName(aNode.nodeName);
		if (refnodes.length != nodes.length) return aNode.value;
		for (var i = 0; i < refnodes.length; i++) {
			if ((nodes[i] == aNode) && (refnodes[i].name == aNode.name) && (refnodes[i].defaultValue == aNode.defaultValue)) {
				return refnodes[i].value;
			}
		}
		return aNode.value;
	},

	// process the DOM Node (update the links, remove attributes and process the options)
	processDOMNode: function (aNode) {
		this.disableAnimation(aNode);
		try {
			switch (aNode.nodeName.toLowerCase()) {
				case "img":
				case "embed":
					// "embed": embedding multimedia content
					if (this.option["format"]) {
						if (aNode.hasAttribute("onclick")) aNode = this.normalizeJavaScriptLink(aNode, "onclick");
						var aDownload = true;
						if (aNode.nodeName.toLowerCase() == "img") {
							try {
								aDownload = aNode.complete;
							} catch (ex) {}
						}
						var aFileName = this.download(aNode.src, aDownload);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("src", this.relativeLinkFix(aFileName));
					} else {
						return wpdCommon.removeNodeFromParent(aNode);
					}
					break;
				case "object":
					// for embedding different data sources in the html page
					if (this.option["format"]) {
						var aFileName = this.download(aNode.data, true);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("data", this.relativeLinkFix(aFileName));
					} else {
						return wpdCommon.removeNodeFromParent(aNode);
					}
					break;
				case "body":
					if (this.option["format"]) {
						var aFileName = this.download(aNode.background, true);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("background", this.relativeLinkFix(aFileName));
					} else {
						aNode.removeAttribute("background");
						aNode.removeAttribute("bgcolor");
						aNode.removeAttribute("text");
					}
					break;
				case "table":
				case "tr":
				case "th":
				case "td":
					if (this.option["format"]) {
						var aFileName = this.download(aNode.getAttribute("background"), true);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("background", this.relativeLinkFix(aFileName));
					} else {
						aNode.removeAttribute("background");
						aNode.removeAttribute("bgcolor");
					}
					break;
				case "input":
					if (aNode.type.toLowerCase() == "image") {
						if (this.option["format"]) {
							var aFileName = this.download(aNode.src, true);
							// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
							if (aFileName) aNode.setAttribute("src", this.relativeLinkFix(aFileName));
						} else {
							aNode.setAttribute("type", "button");
							aNode.removeAttribute("src");
						}
					} else if ((aNode.type.toLowerCase() != "hidden") && (aNode.hasAttribute("value"))) {
						if (WPD_CLONENODEBUG) aNode.setAttribute("value", this.getCurrentNodeValue(aNode));
						if (WPD_ENTITYBUG) aNode.setAttribute("value", this.convertEntity(aNode.getAttribute("value")));
					}
					break;
				case "link":
					// could containt urls (icon, stylesheet and fontdef)
					// We have to remove nodes with the stylesheet attribute because they will be added later
					if(!aNode.hasAttribute("rel")) return aNode;
					if (aNode.getAttribute("rel").toLowerCase() == "stylesheet"
							&& (aNode.hasAttribute("href") && aNode.getAttribute("href").indexOf("chrome://") == -1)) {
						return wpdCommon.removeNodeFromParent(aNode);
					} else if (aNode.getAttribute("rel").toLowerCase() == "shortcut icon"
							|| aNode.getAttribute("rel").toLowerCase() == "icon") {
						var aFileName = this.download(aNode.href, true);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("href", this.relativeLinkFix(aFileName));
					} else if (aNode.getAttribute("rel").toLowerCase() == "fontdef") {
						var aFileName = this.download(aNode.src, true);
						// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
						if (aFileName) aNode.setAttribute("src", this.relativeLinkFix(aFileName));
					} else {
						aNode.setAttribute("href", aNode.href);
					}
					break;
				case "style":
					return wpdCommon.removeNodeFromParent(aNode);
					break;
				case "applet":
					if (aNode.hasAttribute("code")) aNode.setAttribute("code", "");
					if (aNode.hasAttribute("codebase")) aNode.setAttribute("codebase", "");
					if (aNode.hasAttribute("archive")) aNode.setAttribute("archive", "");
					break;
				case "script":
					if (this.option["script"]) {
						if (aNode.hasAttribute("src")) {
							var aFileName = this.download(aNode.src, true);
							// Changed by Dan S. for Zotero -- see this.repairRelativeLinks()
							if (aFileName) aNode.setAttribute("src", this.relativeLinkFix(aFileName));
						}
					} else {
						if (WPD_JAVASCRIPTSRCBUG && aNode.hasAttribute("src")) {
							//if ( aNode.getAttribute("src").indexOf("http://")!=-1 ) {
							//  aNode.setAttribute("src", "http://0.0.0.0");
							//} else {
							aNode.setAttribute("src", "");
							//}
						} else {
							return wpdCommon.removeNodeFromParent(aNode);
						}
					}
					break;
				case "noscript":
					if (!WPD_JAVASCRIPTSRCBUG) return wpdCommon.removeNodeFromParent(aNode);
					break;
				case "a":
				case "area":
					if (aNode.hasAttribute("onclick")) aNode = this.normalizeJavaScriptLink(aNode, "onclick");
					if (!aNode.hasAttribute("href")) return aNode;
					if (aNode.target == "_blank") aNode.setAttribute("target", "_top");
					if (aNode.href.match(/^javascript:/i)) aNode = this.normalizeJavaScriptLink(aNode, "href");
					if (!this.selection && aNode.getAttribute("href").charAt(0) == "#") return aNode;
					// download file depending on option settings and file extension
					if (this.checkFileTypeOptions(aNode.href)) {
						var aFileName = this.download(aNode.href, true);
						if (aFileName) aNode.setAttribute("href", aFileName);
					} else {
						aNode.setAttribute("href", aNode.href);
					}
					break;
				case "form":
					aNode.setAttribute("action", wpdCommon.resolveURL(this.currentURL, aNode.action));
					break;
				case "meta":
					if ((aNode.hasAttribute("http-equiv") && aNode.hasAttribute("content")) && (aNode.getAttribute("http-equiv").toLowerCase() == "content-type") && (aNode.getAttribute("content").match(/charset\=/i))) {
						// we remove possible charset definitions because they will be added later
						return wpdCommon.removeNodeFromParent(aNode);
					}
					if ((aNode.hasAttribute("http-equiv") && aNode.hasAttribute("content")) && (aNode.getAttribute("http-equiv").toLowerCase() == "refresh") && (aNode.getAttribute("content").match(/URL\=/i))) {
						// there should be no refresh present - could be a noframe relict...
						// (is already processed or timer is longer...)
						return wpdCommon.removeNodeFromParent(aNode);
					}
					break;
				case "base":
					//<BASE HREF="http://www.amin.org/look/amin/">
					// we need to set the base url to currenturl
					if (aNode.hasAttribute("href") && (aNode.getAttribute("href") != "")) this.currentURL = aNode.getAttribute("href");
					return wpdCommon.removeNodeFromParent(aNode);
					break;
				case "frame":
				case "iframe":
					// normal and embedded frames (iframe) -> call "saveDocumentEx" for saving the frame document
					try {
						// we don't have to worry about the currentURL - saveDocumentEx will set the
						// currentURL to the URL of the frame document and afterwards back to the baseURL
						if (this.frameNumber < this.frameList.length) {
							var newFileName = this.saveDocumentEx(this.frameList[this.frameNumber++].document, this.name + "_" + this.frameNumber);
							aNode.setAttribute("src", this.relativeLinkFix(newFileName));
						}
					} catch (ex) {
						wpdCommon.addError("[wpdCommon.processDOMNode]:\n -> aNode.nodeName: " + aNode.nodeName, ex);
					}
					break;
				case "xmp":
					// TO DO
					var pre = aNode.ownerDocument.createElement("pre");
					pre.appendChild(aNode.firstChild);
					aNode.parentNode.replaceChild(pre, aNode);
					break;
			}
			if (!this.option["format"]) {
				aNode.removeAttribute("style");
			} else if (aNode.style && aNode.style.cssText) {
				var newCSStext = this.processCSSText(aNode.style.cssText, this.currentURL, true);
				if (newCSStext) aNode.setAttribute("style", newCSStext);
			}
			if (!this.option["script"]) {
				aNode.removeAttribute("onmouseover");
				aNode.removeAttribute("onmouseout");
				aNode.removeAttribute("onload");
			}
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.processDOMNode]:\n -> aNode.nodeName: " + aNode.nodeName, ex);
		}
		return aNode;
	},


	// get through the DOM tree (recursiv function)
	processDOMRecursively: function (rootNode) {
		if (rootNode == null) return;
		for (var curNode = rootNode.firstChild; curNode != null; curNode = curNode.nextSibling) {
			if (curNode.nodeName != "#text" && curNode.nodeName != "#comment") {
				curNode = this.processDOMNode(curNode);
				this.processDOMRecursively(curNode);
			} else if ((curNode.nodeName == "#text") && (wpdCommon.trim(curNode.nodeValue) != "")) {
				// we need to replace special chars with HTML Entities
				if (WPD_ENTITYBUG) curNode.nodeValue = this.convertEntity(curNode.nodeValue);
				// if we have CRLFs before or after the text "innerhtml" will remove them,
				// so we have to make sure that we preserve this CRLFs for the PRE Tag
				if (WPD_CRLFBUG) curNode.nodeValue = wpdCommon.checkCRLF(curNode);
			}
		}
	},

	// Do a correction directly inside the final HTML text.
	// This is necessary because setting the css text for the
	// style attribute does not work - innerHTML will finally
	// generate e.g "repeat scroll 0%;" regardless of the style setting
	// (e.g. "repeat;")
	repairInlineCSS: function (aHTMLText) {
		if ((WPD_CSSSCROLLBUG) && (aHTMLText.match(/background:/i))) {
			// Regex fixed by Dan for Zotero
			//var re = new RegExp(/style=\"(.*)background:(.*)(repeat scroll 0(?:pt|px|%);)/);
			var re = new RegExp(/style=\"([^\"]*)background:([^;\"]*)(repeat scroll 0(?:pt|px|%);?)/);
			while (re.exec(aHTMLText)) {
				var firstPart = RegExp.$1;
				var secondPart = RegExp.$2;
				// '?' added by Dan for Zotero
				//var thirdPart = RegExp.$3.replace(/scroll 0(pt|px|%);/g, ';');
				var thirdPart = RegExp.$3.replace(/scroll 0(pt|px|%);?/g, ';');
				aHTMLText = aHTMLText.replace(re, "style=\"" + firstPart + "background:" + secondPart + thirdPart);
			}
		}
		if ((WPD_CSSBACKGROUNDPOSITIONBUG) && (aHTMLText.match(/background-position: /i))) {
			// Regex fixed by Dan for Zotero
			//var re = new RegExp(/style=\"(.*)background-position: 0(?:pt|px|%);/);
			var re = new RegExp(/style=\"([^\"]*)background-position: 0(?:pt|px|%);/);
			while (re.exec(aHTMLText)) {
				aHTMLText = aHTMLText.replace(re, "style=\"" + RegExp.$1 + "background-position: ;");
			}
		}
		return aHTMLText;
	},

	// While we're replacing references with local file paths,
	// we don't want to have the browser try and fetch them
	// We prefix them with 'about:blank?' and remove later via repairRelativeLinks
	relativeLinkFix: function (aFileName) {
		return "about:blank?" + aFileName;
	},

	// Added by Dan S. for Zotero to restore relative links,
	// which are prepended with "about:blank?" to fix a bug in Scrapbook/WPD
	// that sending an invalid request to the server when the img src
	// is a relative link to a file in a different directory
	repairRelativeLinks: function (aHTMLText) {
		return aHTMLText.replace(/(src|background|data|href)="about:blank\?([^"]*)"/g, '$1="$2"');
	},


	// process the CSS text of one stylesheet element
	processCSSText: function (aCSStext, aCSShref, inline) {
		if (!aCSStext) return "";

		// search for "url" entries inside the css
		// Double-quotes in regexp added by Dan S. for Zotero
		var re = new RegExp(/ url\("?([^'")]+)"?\)/);
		var i = 0;
		while (aCSStext.match(re)) {
			if (++i > 20) break; // safer (we try it maximal 20 times for one stylesheet element)
			var imgFile = this.download(wpdCommon.resolveURL(aCSShref, RegExp.$1), true);
			aCSStext = aCSStext.replace(re, " url('" + imgFile + "')");
		}

		// search for "content" entries inside the css and clean "attr"
		re = new RegExp(/ content: \"(.*?)\"; /);
		if (aCSStext.match(re)) {
			var innerQuote = RegExp.$1;
			innerQuote = innerQuote.replace(/\"/g, '\\"');
			innerQuote = innerQuote.replace(/\\\" attr\(([^\)]+)\) \\\"/g, '" attr($1) "');
			aCSStext = aCSStext.replace(re, ' content: "' + innerQuote + '"; ');
		}

		//
		if ((WPD_CSSSCROLLBUG) && (aCSStext.match(/background: /i))) aCSStext = aCSStext.replace(/ scroll 0(pt|px|%);/g, ";");
		if ((WPD_CSSBACKGROUNDPOSITIONBUG) && (aCSStext.match(/background-position: /i))) aCSStext = aCSStext.replace(/ background-position: 0(pt|px|%);/g, ";");
		return aCSStext;
	},

	// process the CSS stylesheets (recursively)
	// CSS Types:
	// UNKNOWN_RULE = 0,
	// STYLE_RULE = 1,
	// CHARSET_RULE = 2,
	// IMPORT_RULE = 3,
	// MEDIA_RULE = 4,
	// FONT_FACE_RULE = 5,
	// PAGE_RULE = 6
	processCSSRecursively: function (aCSS) {
		if (!aCSS || aCSS.disabled) return "";
		var content = "";
		var medium = aCSS.media.mediaText;
		if (medium != "" && medium.indexOf("screen") < 0 && medium.indexOf("all") < 0) {
			return "";
		}
		// Disabled by Dan S. to fix CSS on snapshots of Reader View
		//if (aCSS.href != null && aCSS.href.indexOf("chrome") == 0) return "";
		var flag = "";

		// Added by Dan S. for Zotero
		//
		// Make sure cssRules is accessible -- it might not be if a <link>
		// element appears within <body> instead of <head>
		try {
			aCSS.cssRules
		} catch (e) {
			var msg = "Unable to access cssRules property of " + aCSS.href + " in wpdDOMSaver.processCSSRecursively()";
			Zotero.debug("WebPageDump: " + msg, 2);
			Components.utils.reportError(msg);
			return "";
		}

		for (var i = 0; i < aCSS.cssRules.length; i++) {
			if (aCSS.cssRules[i].type == 1 || aCSS.cssRules[i].type == 4) {
				if (flag == "") {
					content += "\n/* ::::: " + aCSS.href + " ::::: */\n\n"; // write original css filename
					flag = aCSS.href;
				}
				var ref = aCSS.href;
				if (flag == null || flag.indexOf(".css") == -1) ref = this.currentURL;
				content += this.processCSSText(aCSS.cssRules[i].cssText, ref, false) + "\n";
			} else if (aCSS.cssRules[i].type == 3) {
				content += this.processCSSRecursively(aCSS.cssRules[i].styleSheet);
			}
		}
		return content;
	},

	//given a file name and source URL (optional) with content (optional)
	//returns a unique file name and registers it
	getUniqueFileNameAndRegister: function(fileName, sourceURL, content) {
		fileName = this.checkForEqualFilenames(
			wpdCommon.getValidFileName(fileName),
			sourceURL);
		this.registerFile(fileName, sourceURL, content);
		return fileName;
	},

	//register filename, so we don't overwrite them later
	registerFile: function (newFileName, sourceURL, content) {
		this.fileInfo[newFileName.toLowerCase()] = {
			url: sourceURL,
			downloaded: content
		}
	},

	// is the file registered (e.g. downloaded)?
	isFileRegistered: function (newFileName) {
		if (this.fileInfo[newFileName.toLowerCase()] != undefined) return true;
		return false;
	},

	isDownloaded: function(fileName) {
		fileName = fileName.toLowerCase();
		if(!this.fileInfo[fileName]) return;
		return this.fileInfo[fileName].downloaded;
	},

	// check for equal Filenames with different locations
	// if this is the case, we generate a new name
	// if no aURLSpec is passed, this generates a unique file name
	checkForEqualFilenames: function (newFileName, aURLSpec) {
		if (this.isFileRegistered(newFileName)) {
			if (!aURLSpec || this.fileInfo[newFileName.toLowerCase()]["url"] != aURLSpec) {
				// the file is already registered but from a different location
				// => probably not the same file, so we have to find a different name it (e.g. filename_001.ext)
				var seq = 1;
				var fileLR = wpdCommon.splitFileName(newFileName);
				if (!fileLR[1]) fileLR[1] = "dat";
				newFileName = fileLR[0] + "_" + wpdCommon.addLeftZeros(seq++, 3) + "." + fileLR[1];
				while (this.fileInfo[newFileName.toLowerCase()] != undefined) {
					// is the file already registered with the new name?
					if (aURLSpec && this.fileInfo[newFileName.toLowerCase()]["url"] == aURLSpec) return newFileName; // Yes -> so it's already downloaded and we are finished
					newFileName = fileLR[0] + "_" + wpdCommon.addLeftZeros(seq++, 3) + "." + fileLR[1]; // No -> "increment" filename
				}
			}
		}
		return newFileName;
	},

	// Download the specified URL to "this.currentDir". Takes
	// care about equal filenames from different locations
	download: function (aURLSpec, aDownload) {
		if (!aURLSpec) return "";

		// is this a relative URL (no protocol present) which needs to be resolved?
		if (aURLSpec.indexOf("://") < 0) aURLSpec = wpdCommon.resolveURL(this.currentURL, aURLSpec);

		try {
			var aURL = wpdCommon.convertURLToObject(aURLSpec);

			// generate a filename
			var newFileName = aURL.fileName;
			if (!newFileName) newFileName = "untitled";
			// same name but different location?
			newFileName = this.getUniqueFileNameAndRegister(newFileName, aURLSpec);
			// is the file already registered (processed) ?
			if (!this.isDownloaded(newFileName)) {
				if (aDownload) {
					aDownload = wpdCommon.downloadFile(aURLSpec, this.currentDir + newFileName);
				} else {
					aDownload = true;
				}
				this.registerFile(newFileName, aURLSpec, aDownload);
			}
			return newFileName;
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.download]\n -> aURLSpec: " + aURLSpec, ex);
			return "";
		}
	},

	// Get a CSS filename node for inserting in the DOM Tree
	createCSSFileNode: function (aDocument, rootNode, aFileName) {
		var newLinkNode = aDocument.createElement("link");

		rootNode.firstChild.appendChild(aDocument.createTextNode("\n"));

		newLinkNode.setAttribute("media", "all");
		newLinkNode.setAttribute("href", aFileName);
		newLinkNode.setAttribute("type", "text/css");
		newLinkNode.setAttribute("rel", "stylesheet");

		rootNode.firstChild.appendChild(newLinkNode);

		rootNode.firstChild.appendChild(aDocument.createTextNode("\n"));
		//return newLinkNode;
	},

	// Creates a placeholder node for inserting the DOCTYPE after the html tag
	createPseudeDocTypeNode: function (aDocument, rootNode) {
		var aDoctype = aDocument.doctype;
		if (!aDoctype) return;
		try {
			rootNode.insertBefore(aDocument.createTextNode("\n"), rootNode.firstChild);

			var metaNode = aDocument.createElement("wpd_doctype");
			rootNode.insertBefore(metaNode, rootNode.firstChild);

			rootNode.insertBefore(aDocument.createTextNode("\n"), rootNode.firstChild);
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.createDocTypeNode]", ex);
		}
	},

	// replaces the placeholder node generated by createPseudeDocTypeNode with the DOCTYPE
	replaceDocType: function (aDocument, aHTMLText) {
		var aDoctype = aDocument.doctype;
		if (!aDoctype) return aHTMLText;
		try {
			return aHTMLText.replace("<wpd_doctype></wpd_doctype>", this.getDocType(aDocument));
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.replaceDocType]", ex);
		}
		return aHTMLText;
	},

	// Returns the HTML Text generated from rootNode and does
	// some processing (WPD_DOCTYPEBUG, WPD_ENTITYBUG, cleaning,...)
	generateHTMLString: function (aDocument, rootNode) {
		if (WPD_DOCTYPEBUG) this.createPseudeDocTypeNode(aDocument, rootNode);
		var HTMLText = wpdCommon.nodeToHTMLString(rootNode);
		if (WPD_DOCTYPEBUG) HTMLText = this.replaceDocType(aDocument, HTMLText);
		// adding the doctype entry at the top
		HTMLText = this.getDocType(aDocument) + HTMLText;
		HTMLText = HTMLText.replace(/\x00/g, " ");
		// replace the &amp; added by the innerHTML method
		// because we have already generated all entities
		if (WPD_ENTITYBUG) HTMLText = HTMLText.replace(/&amp;/g, "&");

		// Added by Dan S. for Zotero
		HTMLText = this.repairRelativeLinks(HTMLText);

		return this.repairInlineCSS(HTMLText);
	},

	// Returns a DOCTYPE definition string based on aDocument.doctype
	getDocType: function (aDocument) {
		var aDoctype = aDocument.doctype;
		if (!aDoctype) return "";
		var dt = "<!DOCTYPE " + aDoctype.name;
		if (aDoctype.publicId) dt += ' PUBLIC "' + aDoctype.publicId + '"';
		if (aDoctype.systemId) dt += ' "' + aDoctype.systemId + '"';
		dt += ">\n";
		return dt;
	},

	// Get the meta charset information from the document
	getMetaCharset: function (aDocument) {
		var metas = aDocument.getElementsByTagName("meta");
		for (var i = metas.length; --i >= 0;) {
			var meta = metas[i];
			if (/content-type/i.test(meta.httpEquiv)) {
				r = /^text\/html; *charset=(.*)$/i.exec(meta.content);
				return r[1];
			}
		}
		return "";
	},


	// Create and return a meta charset node for the DOM Tree
	createMetaCharsetNode: function (aDocument, rootNode, aContentType, aCharSet) {
		try {
			var metaNode = aDocument.createElement("meta");
			rootNode.firstChild.insertBefore(aDocument.createTextNode("\n"), rootNode.firstChild.firstChild);

			metaNode.setAttribute("content", aContentType + "; charset=" + aCharSet);
			metaNode.setAttribute("http-equiv", "Content-Type");

			rootNode.firstChild.insertBefore(metaNode, rootNode.firstChild.firstChild);

			rootNode.firstChild.insertBefore(aDocument.createTextNode("\n"), rootNode.firstChild.firstChild);
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.createMetaCharsetNode]", ex);
		}
	},

	// get a meta node for the DOM Tree
	createMetaNameNode: function (aDocument, rootNode, name, content) {
		try {
			var metaNode = aDocument.createElement("meta");

			metaNode.setAttribute("content", content);
			metaNode.setAttribute("name", name);

			rootNode.firstChild.insertBefore(aDocument.createTextNode("\n"), rootNode.firstChild.firstChild);
			rootNode.firstChild.insertBefore(metaNode, rootNode.firstChild.firstChild);
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.createMetaNameNode]", ex);
		}
	},

	/*existMetaCharsetNode : function(aDocument);
	{
	  var metaNodes = aDocument.getElementsByTagName("meta");
	  for (var i=0; i<metaNodes.length; i++ ) {
      if ( (metaNodes[i].hasAttribute("http-equiv") && metaNodes[i].hasAttribute("content")) &&
			     (metaNodes[i].getAttribute("http-equiv").toLowerCase() == "content-type") &&
				   (metaNodes[i].getAttribute("content").match(/charset\=/i)) )
						return true;
	  }
	  return false;
	}*/


	// Return the WPD Meta Base URL Information from aFile
	getMetaBaseURL: function (aFile) {
		if (wpdCommon.pathExists(aFile)) {
			str = new String(wpdCommon.readFile(aFile, false, true));
			re = new RegExp(/<meta name=\"wpd_baseurl\" content=\"(.*?)\">/);
			if (str.match(re)) {
				return RegExp.$1;
			}
		}
		return "";
	},

	// Return the WPD Meta Date Information from aFile
	getMetaDate: function (aFile) {
		if (wpdCommon.pathExists(aFile)) {
			str = new String(wpdCommon.readFile(aFile, false, true));
			re = new RegExp(/<meta name=\"wpd_date\" content=\"(.*?)\">/);
			if (str.match(re)) {
				return RegExp.$1;
			}
		}
		return "";
	},

	// creates the meta nodes for the wpd meta tags (version, baseurl, url, date/time)
	createMetaInformation: function (aDocument, rootNode) {
		// insert url/date/time meta information
		//
		var d = this.dateObj.getUTCFullYear() + "-" + wpdCommon.addLeftZeros(this.dateObj.getUTCMonth(), 2) + "-" + wpdCommon.addLeftZeros(this.dateObj.getUTCDate(), 2);
		d = d + "T" + wpdCommon.addLeftZeros(this.dateObj.getUTCHours(), 2) + ":" + wpdCommon.addLeftZeros(this.dateObj.getUTCMinutes(), 2) + "Z";
		this.createMetaNameNode(aDocument, rootNode, "wpd_date", d);
		this.createMetaNameNode(aDocument, rootNode, "wpd_url", this.currentURL);
		this.createMetaNameNode(aDocument, rootNode, "wpd_baseurl", this.baseURL);
		this.createMetaNameNode(aDocument, rootNode, "wpd_version", WPD_VERSION);
		rootNode.firstChild.insertBefore(aDocument.createTextNode("\n\n"), rootNode.firstChild.firstChild);
	},

	// save a non HTML "aDocument" as "aFileName" and generate a
	// wrapper HTML File which references "aDocument"
	// ("aFileName" is the filename without(!) extension)
	saveDocumentFile: function (aDocument, aFileName) {
		Zotero.debug("[wpdDOMSaver.saveDocumentFile]: Saving file from " + this.currentURL);
		aFileName = this.download(this.currentURL, true)
		Zotero.debug("[wpdDOMSaver.saveDocumentFile]: Saved to " + aFileName);

		return aFileName;
		/* Wrapper file disabled by Dan S. for Zotero
		var aFileURL = aDocument.location.href;

		if ( !aFileName ) aFileName = "file" + Math.random().toString();
		// this.download will generate a unique filename
		var newFileName = this.download(this.currentURL,true);

		if ( aDocument.contentType.substring(0,5) == "image" ) {
			var HTMLText = '<html><body><img src="' + newFileName + '"></body></html>';
		} else {
			var HTMLText = '<html><head><meta http-equiv="refresh" content="0;URL=' + newFileName + '"></head><body></body></html>';
		}

		var HTMLFile = this.currentDir + aFileName + ".html";

		if (!wpdCommon.writeFile(HTMLText,HTMLFile))
		  wpdCommon.addError("[wpdDOMSaver.saveDocumentFile]: could not write HTML wrapper for "+aFileName+"\n");

		return aFileName + ".html";
		*/
	},

	// save the CSS Stylesheets of "aDocument" as "aFileName" and
	// process the CSS Text
	// "aFileName" is the filename without(!) extension
	// (".css" will be added)
	saveDocumentCSS: function (aDocument, aFileName) {
		var CSSText = ""; //"body {display: block;margin: 8px;}; ";
		if (this.option["format"]) {
			var myStyleSheets = aDocument.styleSheets;
			// get all style sheets to "CSSText"
			for (var i = 0; i < myStyleSheets.length; i++) {
				CSSText += this.processCSSRecursively(myStyleSheets[i]);
			}
			if (CSSText) {
				// don't forget to convert the CSS String to the document charset..
				// (necessary for e.g. font-family)
				if (this.option["encodeUTF8"]) {
					CSSText = wpdCommon.ConvertFromUnicode16(CSSText, "UTF-8");
				} else {
					CSSText = wpdCommon.ConvertFromUnicode16(CSSText, this.curCharacterSet);
				}
				aFileName = this.getUniqueFileNameAndRegister(aFileName + ".css");
				Zotero.debug("[wpdDOMSaver.saveDocumentCSS]: " + this.currentDir + aFileName);
				// write css file
				var CSSFile = this.currentDir + aFileName;
				if (!wpdCommon.writeFile(CSSText, CSSFile)) wpdCommon.addError("[wpdDOMSaver.saveDocumentCSS]: could not write CSS File");
				return aFileName;
			}
		}
		return false;
	},

	// save the HTML "aDocument" as "aFileName" and process the
	// DOM Tree (see processDOMNode) - calls also saveDocumentCSS
	// "aFileName" is the filename without(!) extension
	// (".html" will be added)
	saveDocumentHTML: function (aDocument, aFileName) {
		aFileName = this.getUniqueFileNameAndRegister(aFileName + ".html");
		var aFileNameNoExt = wpdCommon.splitFileName(aFileName)[0];

		Zotero.debug("[wpdDOMSaver.saveDocumentHTML]: " + this.currentDir + aFileName);

		this.curDocument = aDocument;
		this.curCharacterSet = aDocument.characterSet;
		var charset = this.curCharacterSet;
		// we get the html node without childs and add the head and body trees
		// manually so we are sure that we have a correct html file
		var rootNode = aDocument.getElementsByTagName("html")[0].cloneNode(false);

		try {
			var headNode = aDocument.getElementsByTagName("head")[0].cloneNode(true);
			rootNode.appendChild(headNode);
			rootNode.appendChild(aDocument.createTextNode("\n"));
		} catch (ex) {}
		try {
			this.curBody = aDocument.body.cloneNode(true);
		} catch (ex) {
			this.curBody = aDocument.getElementsByTagName("body")[0].cloneNode(true);
		}
		rootNode.appendChild(this.curBody);
		rootNode.appendChild(aDocument.createTextNode("\n"));

		// now the processing of the dom nodes (changing hrefs, downloading...)
		this.processDOMRecursively(rootNode);

		// write css file and add css node with the new css filename in the DOM Tree
		var cssFileName = this.saveDocumentCSS(aDocument, aFileNameNoExt);
		if (cssFileName) this.createCSSFileNode(aDocument, rootNode, cssFileName);

		// create meta information (version, base_url, url, date/time)
		if (this.option["metainfo"]) this.createMetaInformation(aDocument, rootNode);

		// add the charset defintions previously removed by processDOMNode
		if (this.option["encodeUTF8"]) {
			this.createMetaCharsetNode(aDocument, rootNode, aDocument.contentType, "UTF-8");
		} else {
			// charset probably sent by web server only -> add the charset meta header for local viewing
			this.createMetaCharsetNode(aDocument, rootNode, aDocument.contentType, charset);
		}

		// convert the nodes to a html string (including some processing)

		// "var " added by Dan S. for Zotero
		var HTMLText = this.generateHTMLString(aDocument, rootNode);
		// convert the DOM String to the desired Charset
		if (this.option["encodeUTF8"]) {
			HTMLText = wpdCommon.ConvertFromUnicode16(HTMLText, "UTF-8");
		} else {
			HTMLText = wpdCommon.ConvertFromUnicode16(HTMLText, charset);
		}

		this.curCharacterSet = charset;

		// and write the file...
		var HTMLFile = this.currentDir + aFileName;
		if (!wpdCommon.writeFile(HTMLText, HTMLFile)) wpdCommon.addError("[wpdDOMSaver.saveDocumentHTML]: could not write HTML File");

		return aFileName;
	},

	// Decides the calling of SaveDocumentFile or saveDocumentHTML
	saveDocumentEx: function (aDocument, aFileName) {
		// we have to set a new current url which is the
		// base reference url (necessary for frame processing)
		this.currentURL = aDocument.location.href;

		// distinguish between HTML Documents and other
		// embedded files like flash, video or images...
		if ((aDocument.getElementsByTagName("head").length == 0) || !aDocument.contentType.match(/htm|html|xml/i)) {
			aFileName = this.saveDocumentFile(aDocument, aFileName);
		} else {
			aFileName = this.saveDocumentHTML(aDocument, aFileName)
		}

		// set the current URL back to the original base URL
		this.currentURL = this.baseURL;

		return aFileName;

	},

	// Main Routine: call it for saving the actual active top window
	// (be sure to call the init function at the top of this file before)
	saveHTMLDocument: function () {
		try {
			return this.saveDocumentEx(this.document, this.name);
		} catch (ex) {
			wpdCommon.addError("[wpdDOMSaver.saveHTMLDocument]", ex);
		}
	}

};