add support for base64 imgs; don't execute extract scripts multiple times on the same tab

This commit is contained in:
alexadam 2016-08-24 14:56:05 +03:00
parent cc68e99a94
commit 0124977a37
3 changed files with 111 additions and 53 deletions

View file

@ -1,5 +1,5 @@
var allImgSrc = {};
var allImages = []; var allImages = [];
var extractedImages = [];
var maxNrOfElements = 10000; var maxNrOfElements = 10000;
////// //////
@ -7,8 +7,23 @@ function getImageSrc(srcTxt) {
if (!srcTxt) { if (!srcTxt) {
return ''; return '';
} }
allImgSrc[srcTxt] = 'img-' + (Math.floor(Math.random()*1000000)) + '.' + getFileExtension(srcTxt); var isB64Img = isBase64Img(srcTxt);
return '../images/' + allImgSrc[srcTxt]; var fileExtension = getFileExtension(srcTxt);
var newImgFileName = 'img-' + (Math.floor(Math.random()*1000000*Math.random()*100000)) + '.' + fileExtension;
if (isB64Img) {
extractedImages.push({
filename: newImgFileName, // TODO name
data: getBase64ImgData(srcTxt)
});
} else {
allImages.push({
originalUrl: getImgDownloadUrl(srcTxt),
filename: newImgFileName, // TODO name
});
}
return '../images/' + newImgFileName;
} }
function generateRandomTag() { function generateRandomTag() {
@ -98,11 +113,11 @@ function force(contentString) {
var $content = $(contentString); var $content = $(contentString);
$content.find('img').each(function (index, elem) { $content.find('img').each(function (index, elem) {
$(elem).replaceWith('<span>' + tagOpen + 'img src="' + getImageSrc($(elem).attr('src')) + '"' + tagClose + tagOpen + '/img' + tagClose + '</span>'); $(elem).replaceWith('<span>' + tagOpen + 'img src="' + getImageSrc($(elem).attr('src').trim()) + '"' + tagClose + tagOpen + '/img' + tagClose + '</span>');
}); });
$content.find('a').each(function (index, elem) { $content.find('a').each(function (index, elem) {
$(elem).replaceWith('<span>' + tagOpen + 'a href="' + getHref($(elem).attr('href')) + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + '</span>'); $(elem).replaceWith('<span>' + tagOpen + 'a href="' + getHref($(elem).attr('href').trim()) + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + '</span>');
}); });
if ($('*').length < maxNrOfElements) { if ($('*').length < maxNrOfElements) {
@ -144,14 +159,15 @@ function force(contentString) {
// https://github.com/blowsie/Pure-JavaScript-HTML5-Parser // https://github.com/blowsie/Pure-JavaScript-HTML5-Parser
function sanitize(rawContentString) { function sanitize(rawContentString) {
allImgSrc = {}; allImages = [];
extractedImages = [];
var srcTxt = ''; var srcTxt = '';
var dirty = null; var dirty = null;
try { try {
// dirty = getHtmlAsString(rawContent); // dirty = getHtmlAsString(rawContent);
wdirty = $.parseHTML(rawContentString); var wdirty = $.parseHTML(rawContentString);
$wdirty = $(wdirty); $wdirty = $(wdirty);
$wdirty.find('script, style, svg, canvas, noscript').remove(); $wdirty.find('script, style, svg, canvas, noscript').remove(); // TODO remove iframes
$wdirty.find('*:empty').not('img').remove(); $wdirty.find('*:empty').not('img').remove();
dirty = '<div>' + $wdirty.html() + '</div>'; dirty = '<div>' + $wdirty.html() + '</div>';
@ -190,14 +206,14 @@ function sanitize(rawContentString) {
tattrs = attrs.filter(function(attr) { tattrs = attrs.filter(function(attr) {
return attr.name === 'src'; return attr.name === 'src';
}).map(function(attr) { }).map(function(attr) {
return getImageSrc(attr.escaped); return getImageSrc(decodeHtmlEntity(attr.value).trim());
}); });
lastFragment = tattrs.length === 0 ? '<img></img>' : '<img src="' + tattrs[0] + '" alt=""></img>'; lastFragment = tattrs.length === 0 ? '<img></img>' : '<img src="' + tattrs[0] + '" alt=""></img>';
} else if (tag === 'a') { } else if (tag === 'a') {
tattrs = attrs.filter(function(attr) { tattrs = attrs.filter(function(attr) {
return attr.name === 'href'; return attr.name === 'href';
}).map(function(attr) { }).map(function(attr) {
return getHref(attr.escaped); return getHref(decodeHtmlEntity(attr.value).trim());
}); });
lastFragment = tattrs.length === 0 ? '<a>' : '<a href="' + tattrs[0] + '">'; lastFragment = tattrs.length === 0 ? '<a>' : '<a href="' + tattrs[0] + '">';
} else { } else {
@ -282,18 +298,18 @@ function getSelectedNodes() {
///// /////
function deferredAddZip(url, filename, zip) { function deferredAddZip(url, filename) {
var deferred = $.Deferred(); var deferred = $.Deferred();
JSZipUtils.getBinaryContent(url, function(err, data) { JSZipUtils.getBinaryContent(url, function(err, data) {
if (err) { if (err) {
// deferred.reject(err); TODO // deferred.reject(err); TODO
console.log('Error:', err);
deferred.resolve(); deferred.resolve();
} else { } else {
var tmpImg = { extractedImages.push({
filename: filename, filename: filename,
data: base64ArrayBuffer(data) data: base64ArrayBuffer(data)
}; });
allImages.push(tmpImg);
deferred.resolve(); deferred.resolve();
} }
}); });
@ -301,10 +317,7 @@ function deferredAddZip(url, filename, zip) {
} }
chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) { chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) {
console.log('Extract Html...');
var imgsPromises = []; var imgsPromises = [];
allImgSrc = {};
allImages = [];
var result = {}; var result = {};
var pageSrc = ''; var pageSrc = '';
var tmpContent = ''; var tmpContent = '';
@ -317,19 +330,19 @@ chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) {
pageSrc.forEach(function (page) { pageSrc.forEach(function (page) {
tmpContent += getContent(page); tmpContent += getContent(page);
}); });
} else if (request.type === 'echo') {
sendResponse({
echo: true
});
return;
} }
if (tmpContent.trim() === '') { if (tmpContent.trim() === '') {
return; return;
} }
Object.keys(allImgSrc).forEach(function(imgSrc, index) { allImages.forEach(function (tmpImg) {
try { imgsPromises.push(deferredAddZip(tmpImg.originalUrl, tmpImg.filename));
var tmpDeffered = deferredAddZip(getImgDownloadUrl(imgSrc), allImgSrc[imgSrc]);
imgsPromises.push(tmpDeffered);
} catch (e) {
console.log('Error:', e);
}
}); });
$.when.apply($, imgsPromises).done(function() { $.when.apply($, imgsPromises).done(function() {
@ -337,7 +350,7 @@ chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) {
url: getPageUrl(document.title), url: getPageUrl(document.title),
title: getPageTitle(document.title), title: getPageTitle(document.title),
baseUrl: getCurrentUrl(), baseUrl: getCurrentUrl(),
images: allImages, images: extractedImages,
content: tmpContent content: tmpContent
}; };
sendResponse(result); sendResponse(result);

View file

@ -25,8 +25,6 @@ document.getElementById("editChapters").onclick = function() {
window.close(); window.close();
}); });
}; };
function dispatch(action, justAddToBuffer) { function dispatch(action, justAddToBuffer) {
@ -37,7 +35,10 @@ function dispatch(action, justAddToBuffer) {
currentWindow: true, currentWindow: true,
active: true active: true
}, function(tab) { }, function(tab) {
chrome.tabs.sendMessage(tab[0].id, {
type: 'echo'
}, function(response) {
if (!response) {
chrome.tabs.executeScript(tab[0].id, {file: '/jquery.js'}); chrome.tabs.executeScript(tab[0].id, {file: '/jquery.js'});
chrome.tabs.executeScript(tab[0].id, {file: '/utils.js'}); chrome.tabs.executeScript(tab[0].id, {file: '/utils.js'});
chrome.tabs.executeScript(tab[0].id, {file: '/filesaver.js'}); chrome.tabs.executeScript(tab[0].id, {file: '/filesaver.js'});
@ -48,7 +49,17 @@ function dispatch(action, justAddToBuffer) {
chrome.tabs.executeScript(tab[0].id, { chrome.tabs.executeScript(tab[0].id, {
file: 'extractHtml.js' file: 'extractHtml.js'
}, function() { }, function() {
chrome.tabs.sendMessage(tab[0].id, { sendMessage(tab[0].id, action, justAddToBuffer);
});
} else if (response.echo) {
sendMessage(tab[0].id, action, justAddToBuffer);
}
});
});
}
function sendMessage(tabId, action, justAddToBuffer) {
chrome.tabs.sendMessage(tabId, {
type: action type: action
}, function(response) { }, function(response) {
if (!justAddToBuffer) { if (!justAddToBuffer) {
@ -61,8 +72,6 @@ function dispatch(action, justAddToBuffer) {
}); });
} }
}); });
});
});
} }
document.getElementById('savePage').onclick = function() { document.getElementById('savePage').onclick = function() {

View file

@ -39,7 +39,14 @@ function getOriginUrl() {
function getFileExtension(fileName) { function getFileExtension(fileName) {
try { try {
var tmpFileName = fileName.split('.').pop(); var tmpFileName = '';
if (isBase64Img(fileName)) {
tmpFileName = getBase64ImgType(fileName);
} else {
tmpFileName = fileName.split('.').pop();
}
if (tmpFileName.indexOf('?') > 0) { if (tmpFileName.indexOf('?') > 0) {
tmpFileName = tmpFileName.split('?')[0]; tmpFileName = tmpFileName.split('?')[0];
} }
@ -47,12 +54,12 @@ function getFileExtension(fileName) {
if (tmpFileName === 'jpg') { if (tmpFileName === 'jpg') {
tmpFileName = 'jpeg'; tmpFileName = 'jpeg';
} else if (tmpFileName.trim() === '') { } else if (tmpFileName.trim() === '') {
return 'jpeg'; //TODO return '';
} }
return tmpFileName; return tmpFileName;
} catch (e) { } catch (e) {
console.log('Error:', e); console.log('Error:', e);
return 'jpeg'; //TODO return '';
} }
} }
@ -121,3 +128,32 @@ function base64ArrayBuffer(arrayBuffer) {
return base64; return base64;
} }
// http://stackoverflow.com/questions/7394748/whats-the-right-way-to-decode-a-string-that-has-special-html-entities-in-it
/**
 * Decodes numeric HTML character references in a string.
 *
 * Both decimal (`&#38;`) and hexadecimal (`&#x26;`) references are replaced
 * with the character they denote. Named entities (e.g. `&amp;`) are left
 * untouched.
 *
 * @param {string} str - Text that may contain numeric character references.
 * @returns {string} The text with numeric references decoded; an empty
 *     string when `str` is null or undefined.
 */
function decodeHtmlEntity(str) {
    if (str === null || str === undefined) {
        return '';
    }
    return String(str).replace(/&#(?:[xX]([0-9a-fA-F]+)|(\d+));/g, function(match, hex, dec) {
        var codePoint = hex ? parseInt(hex, 16) : parseInt(dec, 10);
        // Values beyond the Unicode range cannot be represented; keep the
        // reference as written instead of producing an unrelated character.
        if (codePoint > 0x10FFFF) {
            return match;
        }
        // fromCodePoint (unlike fromCharCode) emits a surrogate pair for
        // code points above 0xFFFF instead of truncating them to 16 bits.
        return String.fromCodePoint(codePoint);
    });
}
/**
 * Tells whether an image source is an inline base64 data URI.
 *
 * @param {string} srcTxt - The image `src` text to inspect.
 * @returns {boolean} true when the text begins with `data:image/` and
 *     contains the `;base64,` marker somewhere after the first character.
 */
function isBase64Img(srcTxt) {
    // Searching backwards from position 0 can only match at the very start.
    var hasImageDataScheme = srcTxt.lastIndexOf('data:image/', 0) === 0;
    if (!hasImageDataScheme) {
        return false;
    }
    return srcTxt.indexOf(';base64,') >= 1;
}
/**
 * Extracts the image subtype from a data URI, e.g. `png` from
 * `data:image/png;base64,...`.
 *
 * The subtype is whatever sits between the first `/` and the first `;`.
 *
 * @param {string} srcTxt - The data URI text.
 * @returns {string} The subtype, or an empty string when it cannot be
 *     determined (no `/` before the first `;`, or a non-string input).
 */
function getBase64ImgType(srcTxt) {
    try {
        var mediaType = srcTxt.split(';')[0];
        // Without a '/' the second piece is undefined; normalise to '' so the
        // function always returns a string, matching the error path below.
        return mediaType.split('/')[1] || '';
    } catch (e) {
        console.log('Error:', e);
        return '';
    }
}
/**
 * Extracts the base64 payload from a data URI, i.e. everything after the
 * first `;base64,` marker.
 *
 * @param {string} srcTxt - The data URI text.
 * @returns {string} The base64 payload, or an empty string when the marker
 *     is absent or the input is not a string.
 */
function getBase64ImgData(srcTxt) {
    var marker = ';base64,';
    try {
        var markerPos = srcTxt.indexOf(marker);
        if (markerPos < 0) {
            // No payload marker: return '' rather than undefined so callers
            // always receive a string, matching the error path below.
            return '';
        }
        // Slice instead of split to avoid building an array around a
        // potentially very large payload.
        return srcTxt.slice(markerPos + marker.length);
    } catch (e) {
        console.log('Error:', e);
        return '';
    }
}