From 0124977a37ff170eaca1114aa9cfeb2f32a4916b Mon Sep 17 00:00:00 2001 From: alexadam Date: Wed, 24 Aug 2016 14:56:05 +0300 Subject: [PATCH] add support for base64 imgs; don't execute extract scripts multiple times on the same tab --- web-extension/extractHtml.js | 63 ++++++++++++++++++++++-------------- web-extension/menu.js | 59 +++++++++++++++++++-------------- web-extension/utils.js | 42 ++++++++++++++++++++++-- 3 files changed, 111 insertions(+), 53 deletions(-) diff --git a/web-extension/extractHtml.js b/web-extension/extractHtml.js index deeae7e..2a36e3e 100644 --- a/web-extension/extractHtml.js +++ b/web-extension/extractHtml.js @@ -1,5 +1,5 @@ -var allImgSrc = {}; var allImages = []; +var extractedImages = []; var maxNrOfElements = 10000; ////// @@ -7,8 +7,23 @@ function getImageSrc(srcTxt) { if (!srcTxt) { return ''; } - allImgSrc[srcTxt] = 'img-' + (Math.floor(Math.random()*1000000)) + '.' + getFileExtension(srcTxt); - return '../images/' + allImgSrc[srcTxt]; + var isB64Img = isBase64Img(srcTxt); + var fileExtension = getFileExtension(srcTxt); + var newImgFileName = 'img-' + (Math.floor(Math.random()*1000000*Math.random()*100000)) + '.' + fileExtension; + + if (isB64Img) { + extractedImages.push({ + filename: newImgFileName, // TODO name + data: getBase64ImgData(srcTxt) + }); + } else { + allImages.push({ + originalUrl: getImgDownloadUrl(srcTxt), + filename: newImgFileName, // TODO name + }); + } + + return '../images/' + newImgFileName; } function generateRandomTag() { @@ -98,11 +113,11 @@ function force(contentString) { var $content = $(contentString); $content.find('img').each(function (index, elem) { - $(elem).replaceWith('' + tagOpen + 'img src="' + getImageSrc($(elem).attr('src')) + '"' + tagClose + tagOpen + '/img' + tagClose + ''); + $(elem).replaceWith('' + tagOpen + 'img src="' + getImageSrc($(elem).attr('src').trim()) + '"' + tagClose + tagOpen + '/img' + tagClose + ''); }); $content.find('a').each(function (index, elem) { - $(elem).replaceWith('' + tagOpen + 'a href="' + getHref($(elem).attr('href')) + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + ''); + $(elem).replaceWith('' + tagOpen + 'a href="' + getHref($(elem).attr('href').trim()) + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + ''); }); if ($('*').length < maxNrOfElements) { @@ -144,14 +159,15 @@ function force(contentString) { // https://github.com/blowsie/Pure-JavaScript-HTML5-Parser function sanitize(rawContentString) { - allImgSrc = {}; + allImages = []; + extractedImages = []; var srcTxt = ''; var dirty = null; try { // dirty = getHtmlAsString(rawContent); - wdirty = $.parseHTML(rawContentString); + var wdirty = $.parseHTML(rawContentString); $wdirty = $(wdirty); - $wdirty.find('script, style, svg, canvas, noscript').remove(); + $wdirty.find('script, style, svg, canvas, noscript').remove(); // TODO remove iframes $wdirty.find('*:empty').not('img').remove(); dirty = '
' + $wdirty.html() + '
'; @@ -190,14 +206,14 @@ function sanitize(rawContentString) { tattrs = attrs.filter(function(attr) { return attr.name === 'src'; }).map(function(attr) { - return getImageSrc(attr.escaped); + return getImageSrc(decodeHtmlEntity(attr.value).trim()); }); lastFragment = tattrs.length === 0 ? '' : ''; } else if (tag === 'a') { tattrs = attrs.filter(function(attr) { return attr.name === 'href'; }).map(function(attr) { - return getHref(attr.escaped); + return getHref(decodeHtmlEntity(attr.value).trim()); }); lastFragment = tattrs.length === 0 ? '' : ''; } else { @@ -282,18 +298,18 @@ function getSelectedNodes() { ///// -function deferredAddZip(url, filename, zip) { +function deferredAddZip(url, filename) { var deferred = $.Deferred(); JSZipUtils.getBinaryContent(url, function(err, data) { if (err) { // deferred.reject(err); TODO + console.log('Error:', err); deferred.resolve(); } else { - var tmpImg = { + extractedImages.push({ filename: filename, data: base64ArrayBuffer(data) - }; - allImages.push(tmpImg); + }); deferred.resolve(); } }); @@ -301,10 +317,7 @@ function deferredAddZip(url, filename, zip) { } chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) { - console.log('Extract Html...'); var imgsPromises = []; - allImgSrc = {}; - allImages = []; var result = {}; var pageSrc = ''; var tmpContent = ''; @@ -317,19 +330,19 @@ chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) { pageSrc.forEach(function (page) { tmpContent += getContent(page); }); + } else if (request.type === 'echo') { + sendResponse({ + echo: true + }); + return; } if (tmpContent.trim() === '') { return; } - Object.keys(allImgSrc).forEach(function(imgSrc, index) { - try { - var tmpDeffered = deferredAddZip(getImgDownloadUrl(imgSrc), allImgSrc[imgSrc]); - imgsPromises.push(tmpDeffered); - } catch (e) { - console.log('Error:', e); - } + allImages.forEach(function (tmpImg) { + imgsPromises.push(deferredAddZip(tmpImg.originalUrl, tmpImg.filename)); }); $.when.apply($, imgsPromises).done(function() { @@ -337,7 +350,7 @@ chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) { url: getPageUrl(document.title), title: getPageTitle(document.title), baseUrl: getCurrentUrl(), - images: allImages, + images: extractedImages, content: tmpContent }; sendResponse(result); diff --git a/web-extension/menu.js b/web-extension/menu.js index 3c845ac..f76f8ae 100644 --- a/web-extension/menu.js +++ b/web-extension/menu.js @@ -25,8 +25,6 @@ document.getElementById("editChapters").onclick = function() { window.close(); }); - - }; function dispatch(action, justAddToBuffer) { @@ -37,34 +35,45 @@ function dispatch(action, justAddToBuffer) { currentWindow: true, active: true }, function(tab) { + chrome.tabs.sendMessage(tab[0].id, { + type: 'echo' + }, function(response) { + if (!response) { + chrome.tabs.executeScript(tab[0].id, {file: '/jquery.js'}); + chrome.tabs.executeScript(tab[0].id, {file: '/utils.js'}); + chrome.tabs.executeScript(tab[0].id, {file: '/filesaver.js'}); + chrome.tabs.executeScript(tab[0].id, {file: '/jszip.js'}); + chrome.tabs.executeScript(tab[0].id, {file: '/jszip-utils.js'}); + chrome.tabs.executeScript(tab[0].id, {file: '/pure-parser.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/jquery.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/utils.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/filesaver.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/jszip.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/jszip-utils.js'}); - chrome.tabs.executeScript(tab[0].id, {file: '/pure-parser.js'}); - - chrome.tabs.executeScript(tab[0].id, { - file: 'extractHtml.js' - }, function() { - chrome.tabs.sendMessage(tab[0].id, { - type: action - }, function(response) { - if (!justAddToBuffer) { - buildEbook([response]); - } else { - getEbookPages(function (allPages) { - allPages.push(response); - saveEbookPages(allPages); - window.close(); - }); - } - }); + chrome.tabs.executeScript(tab[0].id, { + file: 'extractHtml.js' + }, function() { + sendMessage(tab[0].id, action, justAddToBuffer); + }); + } else if (response.echo) { + sendMessage(tab[0].id, action, justAddToBuffer); + } }); }); } +function sendMessage(tabId, action, justAddToBuffer) { + chrome.tabs.sendMessage(tabId, { + type: action + }, function(response) { + if (!justAddToBuffer) { + buildEbook([response]); + } else { + getEbookPages(function (allPages) { + allPages.push(response); + saveEbookPages(allPages); + window.close(); + }); + } + }); +} + document.getElementById('savePage').onclick = function() { dispatch('extract-page', false); }; diff --git a/web-extension/utils.js b/web-extension/utils.js index 2f55685..3c84202 100644 --- a/web-extension/utils.js +++ b/web-extension/utils.js @@ -39,7 +39,14 @@ function getOriginUrl() { function getFileExtension(fileName) { try { - var tmpFileName = fileName.split('.').pop(); + var tmpFileName = ''; + + if (isBase64Img(fileName)) { + tmpFileName = getBase64ImgType(fileName); + } else { + tmpFileName = fileName.split('.').pop(); + } + if (tmpFileName.indexOf('?') > 0) { tmpFileName = tmpFileName.split('?')[0]; } @@ -47,12 +54,12 @@ function getFileExtension(fileName) { if (tmpFileName === 'jpg') { tmpFileName = 'jpeg'; } else if (tmpFileName.trim() === '') { - return 'jpeg'; //TODO + return ''; } return tmpFileName; } catch (e) { console.log('Error:', e); - return 'jpeg'; //TODO + return ''; } } @@ -121,3 +128,32 @@ function base64ArrayBuffer(arrayBuffer) { return base64; } + +// http://stackoverflow.com/questions/7394748/whats-the-right-way-to-decode-a-string-that-has-special-html-entities-in-it +function decodeHtmlEntity(str) { + return str.replace(/&#(\d+);/g, function(match, dec) { + return String.fromCharCode(dec); + }); +} + +function isBase64Img(srcTxt) { + return srcTxt.indexOf('data:image/') === 0 && srcTxt.indexOf(';base64,') > 0; +} + +function getBase64ImgType(srcTxt) { + try { + return srcTxt.split(';')[0].split('/')[1]; + } catch (e) { + console.log('Error:', e); + return ''; + } +} + +function getBase64ImgData(srcTxt) { + try { + return srcTxt.split(';base64,')[1]; + } catch (e) { + console.log('Error:', e); + return ''; + } +}