misc updates WIP

This commit is contained in:
Alex Adam 2020-02-21 15:15:11 +02:00
parent c39c62b8ec
commit 5d1ff43012
2 changed files with 53 additions and 120 deletions

View file

@ -1,6 +1,5 @@
var allImages = []; var allImages = [];
var extractedImages = []; var extractedImages = [];
var maxNrOfElements = 20000;
var allowedTags = [ var allowedTags = [
'address', 'article', 'aside', 'footer', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'address', 'article', 'aside', 'footer', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'hgroup', 'nav', 'section', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li', 'hgroup', 'nav', 'section', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li',
@ -22,12 +21,13 @@ var tmpIdsToNewCss = {};
// src: https://idpf.github.io/a11y-guidelines/content/style/reference.html // src: https://idpf.github.io/a11y-guidelines/content/style/reference.html
var supportedCss = [ var supportedCss = [
'background-color', 'background-color',
'border', 'border-top', 'border-right', 'border-bottom', 'border-left', 'border',
'color', 'font', 'font-size', 'font-weight', 'font-family', 'color',
'font',
'letter-spacing', 'line-height', 'letter-spacing', 'line-height',
'list-style', 'outline', 'list-style',
'padding', 'quotes', 'padding', 'quotes',
'text-decoration', 'text-transform', 'word-spacing', 'text-decoration', 'text-transform', 'text-align', 'word-spacing',
]; ];
////// //////
@ -40,6 +40,8 @@ function getImageSrc(srcTxt) {
return ''; return '';
} }
// TODO - convert <imgs> with svg sources to jpeg
var fileExtension = getFileExtension(srcTxt); var fileExtension = getFileExtension(srcTxt);
if (fileExtension === '') { if (fileExtension === '') {
return ''; return '';
@ -62,15 +64,6 @@ function getImageSrc(srcTxt) {
return '../images/' + newImgFileName; return '../images/' + newImgFileName;
} }
/**
 * Flattens every <pre> and <code> element found under the given jQuery
 * object: each one is replaced by a fresh element of the same tag whose
 * content is the original element's innerText only (nested markup is dropped).
 *
 * All <pre> elements are processed before any <code> element.
 *
 * @param {jQuery} $jQueryElement - jQuery-wrapped root to search in.
 */
function formatPreCodeElements($jQueryElement) {
    // innerText is plain text. It is spliced back into an HTML string, so the
    // markup-significant characters must be escaped first; otherwise a code
    // sample such as "<div>" would be re-parsed as a real element.
    function escapeText(text) {
        return String(text == null ? '' : text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }

    ['pre', 'code'].forEach(function (tagName) {
        $jQueryElement.find(tagName).each(function (i, el) {
            $(el).replaceWith('<' + tagName + '>' + escapeText(el.innerText) + '</' + tagName + '>');
        });
    });
}
// tested // tested
function extractMathMl($htmlObject) { function extractMathMl($htmlObject) {
$htmlObject.find('span[id^="MathJax-Element-"]').each(function (i, el) { $htmlObject.find('span[id^="MathJax-Element-"]').each(function (i, el) {
@ -82,11 +75,12 @@ function extractMathMl($htmlObject) {
// TODO // TODO
/**
 * Replaces every <canvas> under $htmlObject with an <img> whose src is a
 * JPEG data URL of the canvas contents.
 *
 * The data URL is read from the element returned by lookupElementByXPath(),
 * not from `elem` itself: the XPath of `elem` is computed with getXPath() and
 * its leading "/div[1]" is rewritten to "/html[1]/body[1]" before the lookup.
 * NOTE(review): presumably $htmlObject is a wrapped copy of the page content
 * whose canvases carry no pixels - confirm. getXPath and lookupElementByXPath
 * are assumed to be defined elsewhere in this file - TODO confirm.
 *
 * A canvas whose counterpart cannot be resolved is left in place. Any
 * exception (e.g. thrown by toDataURL) is logged and the loop continues.
 *
 * @param {jQuery} $htmlObject - jQuery-wrapped content to rewrite in place.
 */
function extractCanvasToImg($htmlObject) {
    $htmlObject.find('canvas').each(function (index, elem) {
        try {
            var xpath = getXPath(elem).replace(/^\/div\[1\]/m, '/html[1]/body[1]');
            var docEl = lookupElementByXPath(xpath);
            if (!docEl || typeof docEl.toDataURL !== 'function') {
                // Nothing to read pixels from; keep the canvas untouched.
                return;
            }
            var imgUrl = docEl.toDataURL('image/jpeg');
            $(elem).replaceWith('<img src="' + imgUrl + '" alt=""></img>');
        } catch (e) {
            console.log(e);
        }
    });
}
@ -105,97 +99,30 @@ function extractSvgToImg($htmlObject) {
} }
// Pre-processing pass over the jQuery-wrapped content.
// Calls extractMathMl, extractCanvasToImg and extractSvgToImg on $htmlObject,
// in that order.
// NOTE(review): this span comes from a side-by-side diff view, so lines shared
// by both revisions show both columns. The commented-out cleanup under "TODO"
// and the active cleanup calls near the end each appear in one column only -
// confirm which revision is current before relying on either.
function preProcess($htmlObject) { function preProcess($htmlObject) {
// TODO
// $htmlObject.find('script, style, noscript, iframe').remove();
// $('body').find('script, style, noscript, iframe').remove()
// $('body').find('script, style, noscript, iframe').contents().remove()
// $('body').find('iframe').remove()
// $('body').find('*:empty').not('img').not('br').not('hr').remove();
// formatPreCodeElements($('body'));
extractMathMl($htmlObject); extractMathMl($htmlObject);
extractCanvasToImg($htmlObject); extractCanvasToImg($htmlObject);
extractSvgToImg($htmlObject); extractSvgToImg($htmlObject);
// Cleanup: remove script/style/noscript/iframe nodes, then remove empty
// elements other than img, br and hr, then pass the content to formatPreCodeElements.
$htmlObject.find('script, style, noscript, iframe').remove();
$htmlObject.find('*:empty').not('img').not('br').not('hr').remove();
formatPreCodeElements($htmlObject);
} }
// force($content, withError): serialises the content tree to a markup string
// without walking it through the regular sanitiser.
//
// Approach: elements are replaced by <object> wrappers whose text contains
// temporary markers (tagOpen / tagClose) standing in for '<' and '>'. The
// whole tree is then read back with .text(), and the markers are swapped for
// real angle brackets.
//
// $content  - jQuery object; when withError is truthy it is first re-wrapped
//             with $() and run through preProcess().
// withError - see above.
// Returns the resulting string, or '' if anything throws (the error is logged).
//
// NOTE(review): the first line below also carries the `parseHTML` function
// header from the other column of the side-by-side diff this text comes from.
function force($content, withError) { function parseHTML(rawContentString) {
try {
// Marker strings; generateRandomTag() is defined elsewhere.
// NOTE(review): both markers are later used as RegExp sources unescaped -
// presumably generateRandomTag() returns only regex-safe characters; confirm.
var tagOpen = '@@@' + generateRandomTag();
var tagClose = '###' + generateRandomTag();
var startEl = '<object>';
var endEl = '</object>';
if (withError) {
$content = $($content);
preProcess($content);
}
// Images: dropped when getImageSrc() returns '', otherwise rewritten to
// marker text carrying the new src and the element's data-class value.
// NOTE(review): if data-class is absent, className is presumably undefined and
// the literal text "undefined" ends up in the class attribute - confirm.
$content.find('img').each(function (index, elem) {
var $elem = $(elem);
var imgSrc = getImageSrc($elem.attr('src'));
if (imgSrc === '') {
$elem.replaceWith('');
} else {
var className = $elem.attr('data-class');
$elem.replaceWith(startEl + tagOpen + 'img src="' + imgSrc + '" class="' + className + '"' + tagClose + tagOpen + '/img' + tagClose + endEl);
}
});
// Links: dropped (including their content) when getHref() returns '',
// otherwise rewritten to marker text around the link's inner HTML.
$content.find('a').each(function (index, elem) {
var $elem = $(elem);
var aHref = getHref($elem.attr('href'));
if (aHref === '') {
$elem.replaceWith('');
} else {
var className = $elem.attr('data-class');
$elem.replaceWith(startEl + tagOpen + 'a href="' + aHref + '" class="' + className + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + endEl);
}
});
all($content);
// Recursive rewrite of one element. Only tags listed in allowedTags are
// handled; their children are processed first (last child to first child),
// then the element itself is replaced by marker text around its inner HTML.
function all($startElement) {
var tagName = $startElement.get(0).tagName.toLowerCase();
if (allowedTags.indexOf(tagName) >= 0) {
var children = $startElement.children();
var childrenLen = children.length;
while (childrenLen--) {
all($(children[childrenLen]));
}
var className = $startElement.attr('data-class');
$startElement.replaceWith(startEl + tagOpen + tagName + ' class="' + className + '"' + tagClose + $startElement.html() + tagOpen + '/' + tagName + tagClose + endEl);
}
}
// Flatten to text, then turn the markers back into '<' and '>'.
var contentString = $content.text();
var tagOpenRegex = new RegExp(tagOpen, 'gi');
var tagCloseRegex = new RegExp(tagClose, 'gi');
contentString = contentString.replace(tagOpenRegex, '<');
contentString = contentString.replace(tagCloseRegex, '>');
// Replace the named nbsp entity with its numeric form.
contentString = contentString.replace(/&nbsp;/gi, '&#160;');
// getHref() replace does not work (&amp; is overwritten)
contentString = escapeXMLChars(contentString);
return contentString;
} catch (e) {
console.log('Error:', e);
return '';
}
}
function sanitize(rawContentString) {
allImages = []; allImages = [];
extractedImages = []; extractedImages = [];
var srcTxt = '';
var dirty = null; var dirty = null;
try { try {
var wdirty = $.parseHTML(rawContentString); $wdirty = $(rawContentString);
$wdirty = $(wdirty);
preProcess($wdirty); preProcess($wdirty);
if ($('*').length > maxNrOfElements) { dirty = $wdirty.html();
return force($wdirty, false);
}
dirty = '<div>' + $wdirty.html() + '</div>';
var results = ''; var results = '';
var lastFragment = ''; var lastFragment = '';
@ -223,7 +150,7 @@ function sanitize(rawContentString) {
// ignore imgs without source // ignore imgs without source
lastFragment = '' lastFragment = ''
} else { } else {
lastFragment = tmpAttrsTxt.length === 0 ? '<img></img>' : '<img ' + tmpAttrsTxt + ' alt=""></img>'; lastFragment = tmpAttrsTxt.length === 0 ? '<img></img>' : '<img' + tmpAttrsTxt + ' alt=""></img>';
} }
} else if (tag === 'a') { } else if (tag === 'a') {
var tmpAttrsTxt = ''; var tmpAttrsTxt = '';
@ -234,7 +161,7 @@ function sanitize(rawContentString) {
tmpAttrsTxt += ' class="' + attrs[i].value + '"'; tmpAttrsTxt += ' class="' + attrs[i].value + '"';
} }
} }
lastFragment = tmpAttrsTxt.length === 0 ? '<a>' : '<a ' + tmpAttrsTxt + '>'; lastFragment = tmpAttrsTxt.length === 0 ? '<a>' : '<a' + tmpAttrsTxt + '>';
} else if (tag === 'br' || tag === 'hr') { } else if (tag === 'br' || tag === 'hr') {
var tmpAttrsTxt = ''; var tmpAttrsTxt = '';
for (var i = 0; i < attrs.length; i++) { for (var i = 0; i < attrs.length; i++) {
@ -242,7 +169,7 @@ function sanitize(rawContentString) {
tmpAttrsTxt += ' class="' + attrs[i].value + '"'; tmpAttrsTxt += ' class="' + attrs[i].value + '"';
} }
} }
lastFragment = '<' + tag + ' ' + tmpAttrsTxt + '></' + tag + '>'; lastFragment = '<' + tag + tmpAttrsTxt + '></' + tag + '>';
} else if (tag === 'math') { } else if (tag === 'math') {
var tmpAttrsTxt = ''; var tmpAttrsTxt = '';
tmpAttrsTxt += ' xmlns="http://www.w3.org/1998/Math/MathML"'; tmpAttrsTxt += ' xmlns="http://www.w3.org/1998/Math/MathML"';
@ -251,7 +178,7 @@ function sanitize(rawContentString) {
tmpAttrsTxt += ' alttext="' + attrs[i].value + '"'; tmpAttrsTxt += ' alttext="' + attrs[i].value + '"';
} }
} }
lastFragment = '<' + tag + ' ' + tmpAttrsTxt + '>'; lastFragment = '<' + tag + tmpAttrsTxt + '>';
} else { } else {
var tmpAttrsTxt = ''; var tmpAttrsTxt = '';
for (var i = 0; i < attrs.length; i++) { for (var i = 0; i < attrs.length; i++) {
@ -259,7 +186,7 @@ function sanitize(rawContentString) {
tmpAttrsTxt += ' class="' + attrs[i].value + '"'; tmpAttrsTxt += ' class="' + attrs[i].value + '"';
} }
} }
lastFragment = '<' + tag + ' ' + tmpAttrsTxt + '>'; lastFragment = '<' + tag + tmpAttrsTxt + '>';
} }
results += lastFragment; results += lastFragment;
@ -270,7 +197,7 @@ function sanitize(rawContentString) {
return; return;
} }
results += "</" + tag + ">\n"; results += "</" + tag + ">";
}, },
chars: function(text) { chars: function(text) {
if (lastTag !== '' && allowedTags.indexOf(lastTag) < 0) { if (lastTag !== '' && allowedTags.indexOf(lastTag) < 0) {
@ -298,27 +225,16 @@ function getContent(htmlContent) {
try { try {
var tmp = document.createElement('div'); var tmp = document.createElement('div');
tmp.appendChild(htmlContent.cloneNode(true)); tmp.appendChild(htmlContent.cloneNode(true));
var dirty = '<div>' + tmp.innerHTML + '</div>'; var tmpHtml = '<div>' + tmp.innerHTML + '</div>';
return sanitize(dirty); return parseHTML(tmpHtml);
} catch (e) { } catch (e) {
console.log('Error:', e); console.log('Error:', e);
return ''; return htmlContent;
} }
} }
///// /////
/**
 * Derives an .xhtml file name from the given string.
 *
 * The input is lower-cased, each run of whitespace becomes a single "_",
 * every character outside a-z, 0-9 and "_" is removed, and a random integer
 * in the range 0..9999 is appended before the ".xhtml" extension.
 *
 * @param {string} url - Text to turn into a file name.
 * @returns {string} The generated file name.
 */
function getPageUrl(url) {
    var slug = url.toLowerCase();
    slug = slug.replace(/\s+/g, '_');
    slug = slug.replace(/[^a-z0-9_]/g, '');
    var randomSuffix = Math.floor(Math.random() * 10000);
    return slug + randomSuffix + '.xhtml';
}
/**
 * Returns the title to use for a page, falling back to 'ebook'.
 *
 * A title that is missing, is not a string, or contains only whitespace is
 * replaced by 'ebook'. Any other title is returned unchanged (not trimmed).
 *
 * @param {string} [title] - Candidate title.
 * @returns {string} The title, or 'ebook' when no usable title was given.
 */
function getPageTitle(title) {
    // Guard the type first: calling .trim() on undefined/null would throw.
    if (typeof title !== 'string' || title.trim().length === 0) {
        return 'ebook';
    }
    return title;
}
function getSelectedNodes() { function getSelectedNodes() {
// if (document.selection) { // if (document.selection) {
// return document.selection.createRange().parentElement(); // return document.selection.createRange().parentElement();
@ -372,7 +288,9 @@ function extractCss(includeStyle, appliedStyles) {
let tmpName = cssClassesToTmpIds[classNames]; let tmpName = cssClassesToTmpIds[classNames];
let tmpNewCss = tmpIdsToNewCss[tmpName]; let tmpNewCss = tmpIdsToNewCss[tmpName];
if (!tmpName) { if (!tmpName) {
tmpName = 'class-' + Math.floor(Math.random()*100000); // TODO - collision between class names when multiple pages
// rename 'class-' to 'c'
tmpName = 'c' + Math.floor(Math.random()*100000);
cssClassesToTmpIds[classNames] = tmpName; cssClassesToTmpIds[classNames] = tmpName;
} }
if (!tmpNewCss) { if (!tmpNewCss) {
@ -381,7 +299,11 @@ function extractCss(includeStyle, appliedStyles) {
for (let cssTagName of supportedCss) { for (let cssTagName of supportedCss) {
let cssValue = $pre.css(cssTagName); let cssValue = $pre.css(cssTagName);
if (cssValue && cssValue.length > 0) { if (cssValue && cssValue.length > 0) {
// TODO - optimisation IF no css value, skip it (smaller css output file)
// create a filter function based on css-tag and skip values
// if (cssValue.indexOf('none') === -1 || cssValue.indexOf('0px') === -1) {
tmpNewCss[cssTagName] = cssValue; tmpNewCss[cssTagName] = cssValue;
// }
} }
} }
tmpIdsToNewCss[tmpName] = tmpNewCss; tmpIdsToNewCss[tmpName] = tmpNewCss;

View file

@ -340,3 +340,14 @@ function getEbookFileName(name) {
.replace(/&quot;/ig, '') .replace(/&quot;/ig, '')
.replace(/&apos;/ig, ''); .replace(/&apos;/ig, '');
} }
/**
 * Derives an .xhtml file name from the given string.
 *
 * The input is lower-cased, each run of whitespace becomes a single "_",
 * every character outside a-z, 0-9 and "_" is removed, and a random integer
 * in the range 0..9999 is appended before the ".xhtml" extension.
 *
 * @param {string} url - Text to turn into a file name.
 * @returns {string} The generated file name.
 */
function getPageUrl(url) {
    var slug = url.toLowerCase();
    slug = slug.replace(/\s+/g, '_');
    slug = slug.replace(/[^a-z0-9_]/g, '');
    var randomSuffix = Math.floor(Math.random() * 10000);
    return slug + randomSuffix + '.xhtml';
}
/**
 * Returns the title to use for a page, falling back to 'ebook'.
 *
 * A title that is missing, is not a string, or contains only whitespace is
 * replaced by 'ebook'. Any other title is returned unchanged (not trimmed).
 *
 * @param {string} [title] - Candidate title.
 * @returns {string} The title, or 'ebook' when no usable title was given.
 */
function getPageTitle(title) {
    // Guard the type first: calling .trim() on undefined/null would throw.
    if (typeof title !== 'string' || title.trim().length === 0) {
        return 'ebook';
    }
    return title;
}