From 045780d9acf2c2ca7cac4ac5209bdf6bc8c8a681 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Tue, 5 Sep 2006 07:51:55 +0000 Subject: [PATCH] closes #250, figure out proper text encodings for import/export MODS uses the encoding as specified in the ; var item; @@ -4115,15 +4116,39 @@ function doImport() { var text = ""; var read; + // read until we see if the file begins with a parse instruction + read = " "; + while(read == " " || read == "\n" || read == "\r") { + read = Scholar.read(1); + } + + var firstPart = read + Scholar.read(4); + if(firstPart == "")) { + read = Scholar.read(1); + firstPart += read; + } + var encodingRe = /encoding=[''"]([^''"]+)[''"]/; + var m = encodingRe.exec(firstPart); + // set character set + try { + Scholar.setCharacterSet(m[1]); + } catch(e) { + Scholar.setCharacterSet("utf-8"); + } + } else { + Scholar.setCharacterSet("utf-8"); + text += firstPart; + } + // read in 16384 byte increments while(read = Scholar.read(16384)) { text += read; } Scholar.Utilities.debug("read in"); - // eliminate heading so we can parse as XML - text = text.replace(/<\?xml[^?]+\?>/, ""); - // parse with E4X var m = new Namespace("http://www.loc.gov/mods/v3"); // why does this default namespace declaration not work!? @@ -5495,7 +5520,9 @@ function processTag(item, tag, value) { } function doImport(attachments) { - Scholar.Utilities.debug("hello"); + // this is apparently the proper character set for RIS, although i''m not + // sure how many people follow this + Scholar.setCharacterSet("IBM850"); var line = true; var tag = data = false; @@ -5560,6 +5587,10 @@ function addTag(tag, value) { } function doExport() { + // this is apparently the proper character set for RIS, although i''m not + // sure how many people follow this + Scholar.setCharacterSet("IBM850"); + var item; while(item = Scholar.nextItem()) { @@ -5974,6 +6005,9 @@ function doImport() { var text; var holdOver = ""; // part of the text held over from the last loop + Scholar.Utilities.debug("doing import: about to set character set"); + Scholar.setCharacterSet("utf-8"); + while(text = Scholar.read(4096)) { // read in 4096 byte increments var records = text.split("\x1D");