This repository has been archived on 2019-08-30. You can view files and clone it, but cannot push or open issues or pull requests.
epub/epub.js
2011-06-12 21:47:48 +03:00

577 lines
18 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

var ZipFile = require("zipfile").ZipFile,
XML2JS = require("xml2js").Parser,
utillib = require("util"),
EventEmitter = require('events').EventEmitter;
/**
 * new EPub(fname[, imageroot][, linkroot])
 * - fname (String): filename for the ebook
 * - imageroot (String): URL prefix for images, defaults to "/images/"
 * - linkroot (String): URL prefix for links, defaults to "/links/"
 *
 * Creates an Event Emitter type object for parsing epub files. Both URL
 * prefixes are trimmed and always stored with a trailing slash.
 *
 *     var epub = new EPub("book.epub");
 *     epub.on("end", function(){ ... });
 *     epub.parse();
 **/
function EPub(fname, imageroot, linkroot){
    // Trims the given prefix (or the fallback when the prefix is falsy)
    // and appends "/" unless it is already the last character
    function withTrailingSlash(prefix, fallback){
        var root = (prefix || fallback).trim();
        return root.substr(-1) != "/" ? root + "/" : root;
    }

    EventEmitter.call(this);

    this.filename = fname;
    this.imageroot = withTrailingSlash(imageroot, "/images/");
    this.linkroot = withTrailingSlash(linkroot, "/links/");
}
utillib.inherits(EPub, EventEmitter);
/**
 * EPub#parse() -> undefined
 *
 * Starts the parser, needs to be called by the script. Resets every
 * piece of parse state on the instance and then hands over to EPub#open
 **/
EPub.prototype.parse = function(){
    // names of archive entries, filled in as the parser locates them
    ["containerFile", "mimeFile", "rootFile"].forEach(function(field){
        this[field] = false;
    }, this);

    // parsed results
    this.metadata = {};
    this.manifest = {};
    this.spine = {toc: false, contents: []};
    this.toc = [];

    this.open();
};
/**
 * EPub#open() -> undefined
 *
 * Opens the epub file with the Zip unpacker and, when the archive lists
 * at least one entry, continues with the mime type check. Emits "error"
 * when the archive cannot be opened or has no entries
 **/
EPub.prototype.open = function(){
    var archive;

    try{
        archive = this.zip = new ZipFile(this.filename);
    }catch(E){
        this.emit("error", new Error("Invalid/missing file"));
        return;
    }

    if(archive.names && archive.names.length){
        this.checkMimeType();
        return;
    }

    this.emit("error", new Error("No files in archive"));
};
/**
 * EPub#checkMimeType() -> undefined
 *
 * Checks if there's a file called "mimetype" (compared case-insensitively)
 * and that its contents are "application/epub+zip". On success runs the
 * root file check, otherwise emits "error"
 **/
EPub.prototype.checkMimeType = function(){
    var self = this;

    // remember the first archive entry named "mimetype", using its original casing
    this.zip.names.some(function(entry){
        if(entry.toLowerCase() != "mimetype"){
            return false;
        }
        self.mimeFile = entry;
        return true;
    });

    if(!this.mimeFile){
        this.emit("error", new Error("No mimetype file in archive"));
        return;
    }

    this.zip.readFile(this.mimeFile, function(err, data){
        var declared;

        if(err){
            self.emit("error", new Error("Reading archive failed"));
            return;
        }

        // casing and surrounding whitespace are ignored in the comparison
        declared = data.toString("utf-8").toLowerCase().trim();
        if(declared == "application/epub+zip"){
            self.getRootFiles();
            return;
        }

        self.emit("error", new Error("Unsupported mime type"));
    });
};
/**
 * EPub#getRootFiles() -> undefined
 *
 * Looks for a "meta-inf/container.xml" file and searches for a
 * rootfile element with mime type "application/oebps-package+xml".
 * On success stores the matching archive entry name in `rootFile` and
 * calls the rootfile parser, otherwise emits "error".
 *
 * Rootfile entries that carry no attributes are skipped instead of
 * raising an exception from inside the XML parser callback.
 **/
EPub.prototype.getRootFiles = function(){
    var names = this.zip.names;

    // locate the container file, keeping the casing used in the archive
    for(var i=0, len = names.length; i<len; i++){
        if(names[i].toLowerCase() == "meta-inf/container.xml"){
            this.containerFile = names[i];
            break;
        }
    }
    if(!this.containerFile){
        this.emit("error", new Error("No container file in archive"));
        return;
    }

    this.zip.readFile(this.containerFile, (function(err, data){
        if(err){
            this.emit("error", new Error("Reading archive failed"));
            return;
        }

        // The whole document is lowercased before parsing, so element names,
        // attribute values and the rootfile path are all matched in lowercase
        var xml = data.toString("utf-8").toLowerCase().trim(),
            xmlparser = new XML2JS();

        xmlparser.on("end", (function(result){
            if(!result || !result.rootfiles || !result.rootfiles.rootfile){
                this.emit("error", new Error("No rootfiles found"));
                return;
            }

            var rootfile = result.rootfiles.rootfile,
                filename = false,
                attrs;

            if(Array.isArray(rootfile)){
                // several rootfiles: take the first usable package document
                for(var i=0, len = rootfile.length; i<len; i++){
                    attrs = rootfile[i] && rootfile[i]["@"];
                    if(attrs &&
                        attrs["media-type"] == "application/oebps-package+xml" &&
                        attrs["full-path"]){
                        filename = attrs["full-path"].toLowerCase().trim();
                        break;
                    }
                }
            }else if(rootfile["@"]){
                attrs = rootfile["@"];
                if(attrs["media-type"] != "application/oebps-package+xml" || !attrs["full-path"]){
                    this.emit("error", new Error("Rootfile in unknown format"));
                    return;
                }
                filename = attrs["full-path"].toLowerCase().trim();
            }

            if(!filename){
                this.emit("error", new Error("Empty rootfile"));
                return;
            }

            // map the lowercased path back to the real archive entry name
            for(var j=0, count = this.zip.names.length; j<count; j++){
                if(this.zip.names[j].toLowerCase() == filename){
                    this.rootFile = this.zip.names[j];
                    break;
                }
            }
            if(!this.rootFile){
                this.emit("error", new Error("Rootfile not found from archive"));
                return;
            }

            this.handleRootFile();
        }).bind(this));

        xmlparser.on("error", (function(err){
            this.emit("error", new Error("Parsing container XML failed"));
        }).bind(this));

        xmlparser.parseString(xml);
    }).bind(this));
};
/**
 * EPub#handleRootFile() -> undefined
 *
 * Reads the rootfile from the archive, parses its XML and passes the
 * parsed object to EPub#parseRootFile. Emits "error" when the file
 * cannot be read or parsed
 **/
EPub.prototype.handleRootFile = function(){
    this.zip.readFile(this.rootFile, (function(err, data){
        if(err){
            this.emit("error", new Error("Reading archive failed"));
            return;
        }

        var xml = data.toString("utf-8"),
            xmlparser = new XML2JS();

        xmlparser.on("end", this.parseRootFile.bind(this));

        xmlparser.on("error", (function(err){
            // name the rootfile here: the container file was already parsed
            // successfully by the time this runs
            this.emit("error", new Error("Parsing rootfile XML failed"));
        }).bind(this));

        xmlparser.parseString(xml);
    }).bind(this));
};
/**
 * EPub#parseRootFile(rootfile) -> undefined
 * - rootfile (Object): parsed rootfile XML
 *
 * Parses elements "metadata," "manifest," "spine" and TOC.
 * Emits "end" if no TOC.
 *
 * Element names are matched without their namespace prefix and
 * case-insensitively. The sections are always handled in the order
 * metadata, manifest, spine regardless of their order in `rootfile`,
 * because EPub#parseSpine looks its entries up in `this.manifest`.
 * "guide" and any other element are ignored.
 **/
EPub.prototype.parseRootFile = function(rootfile){
    var sections = {metadata: [], manifest: [], spine: []};

    // bucket the known sections by their local (un-prefixed) element name
    Object.keys(rootfile).forEach(function(name){
        var key = (name.split(":").pop() || "").toLowerCase().trim();
        if(Object.prototype.hasOwnProperty.call(sections, key)){
            sections[key].push(rootfile[name]);
        }
    });

    sections.metadata.forEach(function(node){
        this.parseMetadata(node);
    }, this);
    sections.manifest.forEach(function(node){
        this.parseManifest(node);
    }, this);
    sections.spine.forEach(function(node){
        this.parseSpine(node);
    }, this);

    if(this.spine.toc){
        this.parseTOC();
    }else{
        this.emit("end");
    }
};
/**
 * EPub#parseMetadata(metadata) -> undefined
 * - metadata (Object): parsed "metadata" element of the rootfile
 *
 * Parses "metadata" block (book metadata, title, author etc.) into
 * `this.metadata`. Element names are matched without their namespace
 * prefix and case-insensitively. Recognised elements:
 *
 *   publisher, title, subject, description, date -> trimmed text
 *   language   -> lowercased, trimmed text
 *   creator    -> `creator` plus `creatorFileAs` (the "opf:file-as"
 *                 attribute, falling back to the creator text)
 *   identifier -> `ISBN` (opf:scheme == "ISBN") and/or `UUID` (id
 *                 attribute containing "uuid"; "urn:uuid:" removed,
 *                 uppercased)
 *
 * When an element occurs several times only the first occurrence is used,
 * except for identifiers where every occurrence is inspected. An element
 * that has attributes but no text yields an empty string.
 **/
EPub.prototype.parseMetadata = function(metadata){
    var textFields = ["publisher", "title", "subject", "description", "date"];

    // First occurrence of a node that may be given as a single value or an array
    function firstOf(node){
        return Array.isArray(node) ? node[0] : node;
    }

    // Untrimmed text of a node: the "#" value for objects, the value itself otherwise
    function rawText(node){
        var first = firstOf(node);
        if(first && typeof first == "object"){
            first = first["#"];
        }
        return String(first || "");
    }

    Object.keys(metadata).forEach(function(name){
        var key = (name.split(":").pop() || "").toLowerCase().trim(),
            node = metadata[name],
            first;

        if(textFields.indexOf(key) >= 0){
            this.metadata[key] = rawText(node).trim();
            return;
        }

        switch(key){
            case "language":
                this.metadata.language = rawText(node).toLowerCase().trim();
                break;
            case "creator":
                first = firstOf(node);
                this.metadata.creator = rawText(node).trim();
                this.metadata.creatorFileAs = String(first && first["@"] && first["@"]["opf:file-as"] || this.metadata.creator).trim();
                break;
            case "identifier":
                // a single identifier is treated as a one-element list
                [].concat(node).forEach(function(entry){
                    var attrs = entry && entry["@"];
                    if(!attrs){
                        return;
                    }
                    if(attrs["opf:scheme"] == "ISBN"){
                        this.metadata.ISBN = String(entry["#"] || "").trim();
                    }else if(attrs["id"] && attrs["id"].match(/uuid/i)){
                        this.metadata.UUID = String(entry["#"] || "").replace('urn:uuid:','').toUpperCase().trim();
                    }
                }, this);
                break;
        }
    }, this);
};
/**
 * EPub#parseManifest(manifest) -> undefined
 * - manifest (Object): parsed "manifest" element of the rootfile
 *
 * Parses "manifest" block (all items included, html files, images, styles).
 * Every item's attribute object is stored in `this.manifest` under its id.
 * An href that does not already start with the rootfile's directory gets
 * that directory prepended (the attribute object is modified in place).
 * Accepts both a list of items and a single item object.
 **/
EPub.prototype.parseManifest = function(manifest){
    var path = this.rootFile.split("/"),
        path_str,
        items;

    // directory of the rootfile, as segments and as a string
    path.pop();
    path_str = path.join("/");

    if(!manifest || !manifest.item){
        return;
    }

    // a manifest with one item may arrive as a bare object instead of an array
    items = Array.isArray(manifest.item) ? manifest.item : [manifest.item];

    items.forEach(function(item){
        var element = item && item['@'];
        if(!element){
            return;
        }
        if(element.href && element.href.substr(0, path_str.length) != path_str){
            element.href = path.concat([element.href]).join("/");
        }
        this.manifest[element.id] = element;
    }, this);
};
/**
 * EPub#parseSpine(spine) -> undefined
 * - spine (Object): parsed "spine" element of the rootfile
 *
 * Parses "spine" block (all html elements that are shown to the reader).
 * The "toc" attribute is resolved through `this.manifest` into
 * `this.spine.toc` (false when the id is unknown) and every itemref whose
 * idref is found in `this.manifest` is appended to `this.spine.contents`.
 * The manifest therefore has to be parsed before the spine.
 * Accepts both a list of itemrefs and a single itemref object.
 **/
EPub.prototype.parseSpine = function(spine){
    var itemrefs;

    if(spine['@'] && spine['@'].toc){
        this.spine.toc = this.manifest[spine['@'].toc] || false;
    }

    if(!spine.itemref){
        return;
    }

    // a spine with one itemref may arrive as a bare object instead of an array
    itemrefs = Array.isArray(spine.itemref) ? spine.itemref : [spine.itemref];

    itemrefs.forEach(function(itemref){
        var element = itemref && itemref['@'] && this.manifest[itemref['@'].idref];
        if(element){
            this.spine.contents.push(element);
        }
    }, this);
};
/**
 * EPub#parseTOC() -> undefined
 *
 * Parses ncx file for table of contents (title, html file). Reads the
 * file referenced by `this.spine.toc.href`, stores the flattened navMap
 * in `this.toc` and emits "end". "end" is also emitted when the file has
 * no navMap/navPoint, in which case `this.toc` is left as it was.
 * Emits "error" when the file cannot be read or parsed
 **/
EPub.prototype.parseTOC = function(){
    // directory of the ncx file; TOC hrefs are joined onto it by walkNavMap
    var path = this.spine.toc.href.split("/");
    path.pop();

    this.zip.readFile(this.spine.toc.href, (function(err, data){
        if(err){
            this.emit("error", new Error("Reading archive failed"));
            return;
        }

        var xml = data.toString("utf-8"),
            xmlparser = new XML2JS();

        xmlparser.on("end", (function(result){
            if(result && result.navMap && result.navMap.navPoint){
                this.toc = this.walkNavMap(result.navMap.navPoint, path);
            }
            this.emit("end");
        }).bind(this));

        xmlparser.on("error", (function(err){
            // name the TOC here: container and rootfile were parsed earlier
            this.emit("error", new Error("Parsing TOC XML failed"));
        }).bind(this));

        xmlparser.parseString(xml);
    }).bind(this));
};
/**
 * EPub#walkNavMap(branch, path[, level]) -> Array
 * - branch (Array | Object): NCX NavPoint object
 * - path (Array): Base path
 * - level (Number): deepness, defaults to 0
 *
 * Walks the NavMap object through all levels and finds elements
 * for TOC. Returns a flat list of `{id, order, title, href}` objects in
 * document order; navPoints without a label or without a content src are
 * left out (their children are still visited). Branches nested deeper
 * than level 7 are not followed.
 **/
EPub.prototype.walkNavMap = function(branch, path, level){
    level = level || 0;

    // stop descending into very deeply nested navPoints
    if(level > 7){
        return [];
    }

    var output = [], element, point, label, title;

    if(!Array.isArray(branch)){
        branch = [branch];
    }

    for(var i=0, len = branch.length; i<len; i++){
        point = branch[i];

        if(point["navLabel"]){
            label = point["navLabel"];
            // the label is either an object with a "text" value or a plain string
            title = label.text || label;

            element = {
                id: (point["@"] && point["@"].id || "").trim(),
                order: Number(point["@"] && point["@"].playOrder || 0),
                title: typeof title == "string" ? title.trim() : "",
                href: (point["content"] && point["content"]["@"] && point["content"]["@"].src || "").trim()
            };

            if(element.href){
                element.href = path.concat([element.href]).join("/");
                output.push(element);
            }
        }

        if(point["navPoint"]){
            output = output.concat(this.walkNavMap(point["navPoint"], path, level+1));
        }
    }

    return output;
};
/**
 * EPub#getChapter(id, callback) -> undefined
 * - id (String): Manifest id value for a chapter
 * - callback (Function): callback function, called as `callback(err)` or
 *   `callback(null, html)`
 *
 * Finds a chapter text for an id. Replaces image and link URL's, removes
 * <head> etc. elements:
 *
 *   - only the contents of <body> are kept (when a body element is found)
 *   - `src` values that resolve to a manifest item become
 *     `imageroot + id + "/" + path`; any other `src` attribute is removed
 *   - `href` values that resolve to a manifest item (ignoring the
 *     "#fragment") become `linkroot + id + "/" + path[#fragment]`; any
 *     other `href` is left untouched
 *
 * Relative URLs are resolved against the directory of the rootfile.
 * The callback receives an error when `id` is not in the manifest or the
 * file cannot be read.
 **/
EPub.prototype.getChapter = function(id, callback){
    var path = this.rootFile.split("/"),
        manifest = this.manifest,
        keys = Object.keys(manifest);
    path.pop();

    // First manifest item whose href equals `target`; with `ignoreFragment`
    // the "#..." part of the manifest href is left out of the comparison
    function lookup(target, ignoreFragment){
        var href;
        for(var i=0, len=keys.length; i<len; i++){
            href = manifest[keys[i]].href;
            if(typeof href != "string"){
                continue;
            }
            if((ignoreFragment ? href.split("#")[0] : href) == target){
                return manifest[keys[i]];
            }
        }
    }

    if(!manifest[id]){
        callback(new Error("Chapter not found"));
        return;
    }

    this.zip.readFile(manifest[id].href, (function(err, data){
        if(err){
            callback(new Error("Reading archive failed"));
            return;
        }

        var str = data.toString("utf-8"),
            body;

        // keep only the body contents; newlines are swapped for NUL while
        // matching so that "." can span the whole document
        body = str.replace(/\n/g,"\u0000").match(/<body[^>]*?>(.*)<\/body[^>]*?>/i);
        if(body){
            str = body[1].replace(/\u0000/g,"\n").trim();
        }

        // replace images
        str = str.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function(o, a, b, c){
            var img = path.concat([b]).join("/").trim(),
                element = lookup(img, false);

            // include only images from manifest
            if(element){
                return a + this.imageroot + element.id+ "/" + img + c;
            }else{
                return "";
            }
        }).bind(this));

        // replace links
        str = str.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function(o, a, b, c){
            // `b` is an empty string for href="", which still has to split cleanly
            var linkparts = (b || "").split("#"),
                link = path.concat([(linkparts.shift() || "")]).join("/").trim(),
                element = lookup(link, true);

            if(linkparts.length){
                link += "#" + linkparts.join("#");
            }

            // rewrite only links that point to a manifest item
            if(element){
                return a + this.linkroot + element.id+ "/" + link + c;
            }else{
                return a + b + c;
            }
        }).bind(this));

        callback(null, str);
    }).bind(this));
};
// Ad-hoc smoke run executed as soon as this file is loaded: parses
// "img.epub" from the working directory, dumps every parsed structure and
// then fetches one chapter by a hard-coded manifest id.
// NOTE(review): presumably leftover development scaffolding; confirm
// before shipping this file as a library module.
var epub = new EPub("img.epub", "tere", "vana");

// any parser error is printed and then rethrown
epub.on("error", function(err){
    console.log("ERROR\n-----");
    throw err;
});

epub.on("end", function(){
    console.log("PARSED\n-----");
    [epub.metadata, epub.manifest, epub.spine, epub.toc].forEach(function(section){
        console.log(section);
    });
    epub.getChapter("item259", function(err, data){
        console.log(err || data);
    });
});

epub.parse();
/*
for(var i=0, len = this.zip.names.length; i<len; i++){
}
*/