HTML replace update (skip script blocks, etc.)

This commit is contained in:
andris9 2011-06-12 23:56:13 +03:00
parent fd06139fcb
commit e0f3931a19
2 changed files with 36 additions and 4 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
*.epub
test.*

37
epub.js
View File

@ -23,6 +23,16 @@ module.exports = EPub;
* });
* epub.on("error", function(error){ ... });
* epub.parse();
*
* Image and link URL format is:
*
* imageroot + img_id + img_zip_path
*
* So an image "logo.jpg" that resides in "OPT/" in the zip archive
* and is listed in the manifest with the id "logo_img" will have the
* following URL (provided that imageroot is "/images/"):
*
* /images/logo_img/OPT/logo.jpg
**/
function EPub(fname, imageroot, linkroot){
EventEmitter.call(this);
@ -493,9 +503,27 @@ EPub.prototype.getChapter = function(id, callback){
var str = data.toString("utf-8");
// strip <body>
str.replace(/\n/g,"\u0000").replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, function(o,d){
str = d.replace(/\u0000/g,"\n").trim();
// temporarily replace linebreaks with \u0000 placeholders ("." does not match newlines in JS regexes); they are restored later
str = str.replace(/\r?\n/g,"\u0000");
// keep only <body> contents
str.replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, function(o,d){
str = d.trim();
});
// remove <script> blocks if any
str = str.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/ig, function(o, s){
return "";
});
// remove <style> blocks if any
str = str.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/ig, function(o, s){
return "";
});
// remove onEvent handlers
str = str.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g, function(o, a,b,c){
return a + "skip-" + b + c;
});
// replace images
@ -545,6 +573,9 @@ EPub.prototype.getChapter = function(id, callback){
}).bind(this));
// bring back linebreaks
str = str.replace(/\u0000/g,"\n").trim();
callback(null, str);
}).bind(this));