Merge pull request #4 from julien-c/master

Add ability to get any file from the Epub + access raw HTML content
2013-12-04 22:45:52 -08:00
parent 60e1311bda 95141a7775
commit e7fa8d455d
3 changed files with 133 additions and 79 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 node_modules
 npm-debug.log
 .DS_Store
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ Available fields:
  * **language** Language code (*en* or *en-us* etc.)
  * **subject** Topic of the book (*Fantasy*)
  * **date** creation of the file (*2006-08-12*)
  * **description**
 ## flow
@@ -71,6 +72,10 @@ Load chapter text from the ebook.
    ...
    epub.getChapter("chapter1", function(error, text){});
 ## getChapterRaw(chapter_id, callback)
 Load raw chapter text from the ebook.
 ## getImage(image_id, callback)
 Load image (as a Buffer value) from the ebook.
@@ -79,3 +84,11 @@ Load image (as a Buffer value) from the ebook.
    ...
    epub.getImage("image1", function(error, img, mimeType){});
 ## getFile(file_id, callback)
 Load any file (as a Buffer value) from the ebook.
    var epub = new EPub(...);
    ...
    epub.getFile("css1", function(error, data, mimeType){});
--- a/epub.js
+++ b/epub.js
@@ -216,7 +216,7 @@ EPub.prototype.getRootFiles = function () {
 /**
 *  EPub#handleRootFile() -> undefined
 *
- *  Parser the rootfile XML and calls rootfile parser
+ *  Parses the rootfile XML and calls rootfile parser
 **/
 EPub.prototype.handleRootFile = function () {
@@ -527,13 +527,105 @@ EPub.prototype.walkNavMap = function (branch, path, id_list, level) {
 *  <head> etc. elements. Return only chapters with mime type application/xhtml+xml
 **/
 EPub.prototype.getChapter = function (id, callback) {
-    var i, len, path = this.rootFile.split("/"), keys = Object.keys(this.manifest);
+    this.getChapterRaw(id, (function (err, str) {
-    path.pop();
+        if (err) {
            callback(err);
            return;
        }
        var i, len, path = this.rootFile.split("/"), keys = Object.keys(this.manifest);
        path.pop();
        // remove linebreaks (no multi line matches in JS regex!)
        str = str.replace(/\r?\n/g, "\u0000");
        // keep only <body> contents
        str.replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, function (o, d) {
            str = d.trim();
        });
        // remove <script> blocks if any
        str = str.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/ig, function (o, s) {
            return "";
        });
        // remove <style> blocks if any
        str = str.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/ig, function (o, s) {
            return "";
        });
        // remove onEvent handlers
        str = str.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g, function (o, a, b, c) {
            return a + "skip-" + b + c;
        });
        // replace images
        str = str.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
            var img = path.concat([b]).join("/").trim(),
                element;
            for (i = 0, len = keys.length; i < len; i++) {
                if (this.manifest[keys[i]].href == img) {
                    element = this.manifest[keys[i]];
                    break;
                }
            }
            // include only images from manifest
            if (element) {
                return a + this.imageroot + element.id + "/" + img + c;
            } else {
                return "";
            }
        }).bind(this));
        // replace links
        str = str.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
            var linkparts = b && b.split("#"),
                link = path.concat([(linkparts.shift() || "")]).join("/").trim(),
                element;
            for (i = 0, len = keys.length; i < len; i++) {
                if (this.manifest[keys[i]].href.split("#")[0] == link) {
                    element = this.manifest[keys[i]];
                    break;
                }
            }
            if (linkparts.length) {
                link  +=  "#" + linkparts.join("#");
            }
            // include only images from manifest
            if (element) {
                return a + this.linkroot + element.id + "/" + link + c;
            } else {
                return a + b + c;
            }
        }).bind(this));
        // bring back linebreaks
        str = str.replace(/\u0000/g, "\n").trim();
        callback(null, str);
    }).bind(this));
 };
 /**
 *  EPub#getChapterRaw(id, callback) -> undefined
 *  - id (String): Manifest id value for a chapter
 *  - callback (Function): callback function
 *
 *  Returns the raw chapter text for an id.
 **/
 EPub.prototype.getChapterRaw = function (id, callback) {
    if (this.manifest[id]) {
        if ((this.manifest[id]['media-type'] || "").toLowerCase().trim()  !=  "application/xhtml+xml") {
-            return callback(new Error("Inavlid mime type for chapter"));
+            return callback(new Error("Invalid mime type for chapter"));
        }
        this.zip.readFile(this.manifest[id].href, (function (err, data) {
@@ -544,79 +636,6 @@ EPub.prototype.getChapter = function (id, callback) {
            var str = data.toString("utf-8");
            // remove linebreaks (no multi line matches in JS regex!)
            str = str.replace(/\r?\n/g, "\u0000");
            // keep only <body> contents
            str.replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, function (o, d) {
                str = d.trim();
            });
            // remove <script> blocks if any
            str = str.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/ig, function (o, s) {
                return "";
            });
            // remove <style> blocks if any
            str = str.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/ig, function (o, s) {
                return "";
            });
            // remove onEvent handlers
            str = str.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g, function (o, a, b, c) {
                return a + "skip-" + b + c;
            });
            // replace images
            str = str.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
                var img = path.concat([b]).join("/").trim(),
                    element;
                for (i = 0, len = keys.length; i < len; i++) {
                    if (this.manifest[keys[i]].href == img) {
                        element = this.manifest[keys[i]];
                        break;
                    }
                }
                // include only images from manifest
                if (element) {
                    return a + this.imageroot + element.id + "/" + img + c;
                } else {
                    return "";
                }
            }).bind(this));
            // replace links
            str = str.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (function (o, a, b, c) {
                var linkparts = b && b.split("#"),
                    link = path.concat([(linkparts.shift() || "")]).join("/").trim(),
                    element;
                for (i = 0, len = keys.length; i < len; i++) {
                    if (this.manifest[keys[i]].href.split("#")[0] == link) {
                        element = this.manifest[keys[i]];
                        break;
                    }
                }
                if (linkparts.length) {
                    link  +=  "#" + linkparts.join("#");
                }
                // include only images from manifest
                if (element) {
                    return a + this.linkroot + element.id + "/" + link + c;
                } else {
                    return a + b + c;
                }
            }).bind(this));
            // bring back linebreaks
            str = str.replace(/\u0000/g, "\n").trim();
            callback(null, str);
        }).bind(this));
@@ -631,7 +650,7 @@ EPub.prototype.getChapter = function (id, callback) {
 *  - id (String): Manifest id value for an image
 *  - callback (Function): callback function
 *
- *  Finds an image an id. Returns the image as Buffer. Callback gets
+ *  Finds an image for an id. Returns the image as Buffer. Callback gets
 *  an error object, image buffer and image content-type.
 *  Return only images with mime type image
 **/
@@ -639,9 +658,27 @@ EPub.prototype.getImage = function (id, callback) {
    if (this.manifest[id]) {
        if ((this.manifest[id]['media-type'] || "").toLowerCase().trim().substr(0, 6)  !=  "image/") {
-            return callback(new Error("Inavlid mime type for image"));
+            return callback(new Error("Invalid mime type for image"));
        }
        this.getFile(id, callback);
    } else {
        callback(new Error("File not found"));
    }
 };
 /**
 *  EPub#getFile(id, callback) -> undefined
 *  - id (String): Manifest id value for a file
 *  - callback (Function): callback function
 *
 *  Finds a file for an id. Returns the file as Buffer. Callback gets
 *  an error object, file contents buffer and file content-type.
 **/
 EPub.prototype.getFile = function (id, callback) {
    if (this.manifest[id]) {
        this.zip.readFile(this.manifest[id].href, (function (err, data) {
            if (err) {
                callback(new Error("Reading archive failed"));
@@ -655,5 +692,6 @@ EPub.prototype.getImage = function (id, callback) {
    }
 };
 // Expose to the world
 module.exports = EPub;