Merge pull request #4 from julien-c/master

Add ability to get any file from the Epub + access raw HTML content
This commit is contained in:
Andris Reinman 2013-12-04 22:45:52 -08:00
commit e7fa8d455d
3 changed files with 133 additions and 79 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
node_modules
npm-debug.log
.DS_Store

View File

@ -46,6 +46,7 @@ Available fields:
* **language** Language code (*en* or *en-us* etc.)
* **subject** Topic of the book (*Fantasy*)
* **date** creation of the file (*2006-08-12*)
* **description** Description of the book
## flow
@ -71,6 +72,10 @@ Load chapter text from the ebook.
...
epub.getChapter("chapter1", function(error, text){});
## getChapterRaw(chapter_id, callback)
Load raw chapter text from the ebook.
## getImage(image_id, callback)
Load image (as a Buffer value) from the ebook.
@ -79,3 +84,11 @@ Load image (as a Buffer value) from the ebook.
...
epub.getImage("image1", function(error, img, mimeType){});
## getFile(file_id, callback)
Load any file (as a Buffer value) from the ebook.
var epub = new EPub(...);
...
epub.getFile("css1", function(error, data, mimeType){});

196
epub.js
View File

@ -216,7 +216,7 @@ EPub.prototype.getRootFiles = function () {
/**
* EPub#handleRootFile() -> undefined
*
* Parser the rootfile XML and calls rootfile parser
* Parses the rootfile XML and calls rootfile parser
**/
EPub.prototype.handleRootFile = function () {
@ -527,13 +527,105 @@ EPub.prototype.walkNavMap = function (branch, path, id_list, level) {
* <head> etc. elements. Return only chapters with mime type application/xhtml+xml
**/
EPub.prototype.getChapter = function (id, callback) {
    // The raw XHTML is fetched by getChapterRaw(); this method only
    // post-processes it. Errors from getChapterRaw() are passed to the
    // callback unchanged.
    this.getChapterRaw(id, (function (err, str) {
        if (err) {
            callback(err);
            return;
        }

        var manifest = this.manifest,
            keys = Object.keys(manifest),
            // directory part of the rootfile path; manifest hrefs are compared
            // against "<that directory>/<relative reference>"
            path = this.rootFile.split("/"),
            bodyMatch;

        path.pop();

        // Returns the first manifest entry whose href equals `target`
        // (optionally ignoring a "#fragment" on the manifest href), or
        // undefined. Entries without a string href are skipped.
        function findByHref(target, ignoreFragment) {
            var i, len, item, href;
            for (i = 0, len = keys.length; i < len; i++) {
                item = manifest[keys[i]];
                href = item && item.href;
                if (typeof href != "string") {
                    continue;
                }
                if (ignoreFragment) {
                    href = href.split("#")[0];
                }
                if (href == target) {
                    return item;
                }
            }
        }

        // remove linebreaks (no multi line matches in JS regex!)
        str = str.replace(/\r?\n/g, "\u0000");

        // keep only <body> contents
        bodyMatch = /<body[^>]*?>(.*)<\/body[^>]*?>/i.exec(str);
        if (bodyMatch) {
            str = bodyMatch[1].trim();
        }

        // remove <script> blocks if any
        str = str.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/ig, "");

        // remove <style> blocks if any
        str = str.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/ig, "");

        // neutralise onEvent handlers by renaming the attribute
        str = str.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g, function (o, a, b, c) {
            return a + "skip-" + b + c;
        });

        // replace images
        str = str.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
            var img = path.concat([b]).join("/").trim(),
                element = findByHref(img, false);

            // include only images from manifest
            if (element) {
                return a + this.imageroot + element.id + "/" + img + c;
            }
            // Drop the whole attribute. For an unquoted value the terminator
            // (whitespace or ">") belongs to the surrounding tag, so it has to
            // be kept or the markup would be corrupted.
            return (c == "\"" || c == "'") ? "" : c;
        }).bind(this));

        // replace links
        str = str.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
            // `b` may be an empty string (href=""), which must still be split
            var linkparts = (b || "").split("#"),
                link = path.concat([(linkparts.shift() || "")]).join("/").trim(),
                element = findByHref(link, true);

            if (linkparts.length) {
                link += "#" + linkparts.join("#");
            }

            // rewrite only links that point to a manifest item,
            // leave everything else (e.g. external URLs) untouched
            if (element) {
                return a + this.linkroot + element.id + "/" + link + c;
            }
            return a + b + c;
        }).bind(this));

        // bring back linebreaks
        str = str.replace(/\u0000/g, "\n").trim();

        callback(null, str);
    }).bind(this));
};
/**
* EPub#getChapterRaw(id, callback) -> undefined
* - id (String): Manifest id value for a chapter
* - callback (Function): callback function
*
* Returns the raw chapter text for an id.
**/
EPub.prototype.getChapterRaw = function (id, callback) {
if (this.manifest[id]) {
if ((this.manifest[id]['media-type'] || "").toLowerCase().trim() != "application/xhtml+xml") {
return callback(new Error("Inavlid mime type for chapter"));
return callback(new Error("Invalid mime type for chapter"));
}
this.zip.readFile(this.manifest[id].href, (function (err, data) {
@ -544,79 +636,6 @@ EPub.prototype.getChapter = function (id, callback) {
var str = data.toString("utf-8");
// remove linebreaks (no multi line matches in JS regex!)
str = str.replace(/\r?\n/g, "\u0000");
// keep only <body> contents
str.replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, function (o, d) {
str = d.trim();
});
// remove <script> blocks if any
str = str.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/ig, function (o, s) {
return "";
});
// remove <style> blocks if any
str = str.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/ig, function (o, s) {
return "";
});
// remove onEvent handlers
str = str.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g, function (o, a, b, c) {
return a + "skip-" + b + c;
});
// replace images
str = str.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
var img = path.concat([b]).join("/").trim(),
element;
for (i = 0, len = keys.length; i < len; i++) {
if (this.manifest[keys[i]].href == img) {
element = this.manifest[keys[i]];
break;
}
}
// include only images from manifest
if (element) {
return a + this.imageroot + element.id + "/" + img + c;
} else {
return "";
}
}).bind(this));
// replace links
str = str.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
var linkparts = b && b.split("#"),
link = path.concat([(linkparts.shift() || "")]).join("/").trim(),
element;
for (i = 0, len = keys.length; i < len; i++) {
if (this.manifest[keys[i]].href.split("#")[0] == link) {
element = this.manifest[keys[i]];
break;
}
}
if (linkparts.length) {
link += "#" + linkparts.join("#");
}
// include only images from manifest
if (element) {
return a + this.linkroot + element.id + "/" + link + c;
} else {
return a + b + c;
}
}).bind(this));
// bring back linebreaks
str = str.replace(/\u0000/g, "\n").trim();
callback(null, str);
}).bind(this));
@ -631,7 +650,7 @@ EPub.prototype.getChapter = function (id, callback) {
* - id (String): Manifest id value for an image
* - callback (Function): callback function
*
* Finds an image an id. Returns the image as Buffer. Callback gets
* Finds an image for an id. Returns the image as Buffer. Callback gets
* an error object, image buffer and image content-type.
* Only manifest items whose media-type starts with "image/" are returned
**/
@ -639,9 +658,27 @@ EPub.prototype.getImage = function (id, callback) {
if (this.manifest[id]) {
if ((this.manifest[id]['media-type'] || "").toLowerCase().trim().substr(0, 6) != "image/") {
return callback(new Error("Inavlid mime type for image"));
return callback(new Error("Invalid mime type for image"));
}
this.getFile(id, callback);
} else {
callback(new Error("File not found"));
}
};
/**
* EPub#getFile(id, callback) -> undefined
* - id (String): Manifest id value for a file
* - callback (Function): callback function
*
* Finds a file for an id. Returns the file as Buffer. Callback gets
* an error object, file contents buffer and file content-type.
**/
EPub.prototype.getFile = function (id, callback) {
if (this.manifest[id]) {
this.zip.readFile(this.manifest[id].href, (function (err, data) {
if (err) {
callback(new Error("Reading archive failed"));
@ -655,5 +692,6 @@ EPub.prototype.getImage = function (id, callback) {
}
};
// Expose to the world
module.exports = EPub;