// Images referenced by URL that were encountered by getImageSrc(); each entry
// is { originalUrl, filename }. Reset at the start of sanitize().
var allImages = [];

// Images that were embedded as base64 data; each entry is { filename, data }.
// Reset at the start of sanitize().
var extractedImages = [];

// Threshold compared against $('*').length (the element count of the current
// document) to decide whether the per-tag processing is affordable.
var maxNrOfElements = 20000;

// Tag names that force() re-emits (without attributes) in its output.
var allowedTags = [
    'address', 'article', 'aside', 'footer', 'header', 'h1', 'h2', 'h3', 'h4',
    'h5', 'h6', 'hgroup', 'nav', 'section', 'dd', 'div', 'dl', 'dt',
    'figcaption', 'figure', 'hr', 'li', 'main', 'ol', 'p', 'pre', 'ul', 'a',
    'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn', 'em', 'i',
    'img', 'kbd', 'mark', 'q', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
    'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'del', 'ins',
    'caption', 'col', 'colgroup', 'table', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr', 'math', 'maction', 'menclose', 'merror', 'mfenced', 'mfrac',
    'mglyph', 'mi', 'mlabeledtr', 'mmultiscripts', 'mn', 'mo', 'mover',
    'mpadded', 'mphantom', 'mroot', 'mrow', 'ms', 'mspace', 'msqrt', 'mstyle',
    'msub', 'msup', 'msubsup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
    'munderover', 'msgroup', 'mlongdiv', 'mscarries', 'mscarry', 'mstack'
];

//////

/**
 * Registers an image source and returns the relative path it will be stored under.
 *
 * Base64 sources are recorded in `extractedImages` (with their decoded payload
 * as returned by getBase64ImgData); every other source is recorded in
 * `allImages` together with the URL returned by getImgDownloadUrl.
 *
 * @param {string} srcTxt - Raw value of an image `src` attribute.
 * @returns {string} '../images/<generated file name>', or '' when srcTxt is
 *     missing or blank.
 */
function getImageSrc(srcTxt) {
    if (!srcTxt) {
        return '';
    }
    srcTxt = srcTxt.trim();
    if (srcTxt === '') {
        return '';
    }

    var isB64Img = isBase64Img(srcTxt);
    var fileExtension = getFileExtension(srcTxt);
    // The file name is a random number; nothing here guards against two
    // images drawing the same number.
    var newImgFileName = 'img-' + (Math.floor(Math.random() * 1000000 * Math.random() * 100000)) + '.' + fileExtension;

    if (isB64Img) {
        extractedImages.push({
            filename: newImgFileName, // TODO name
            data: getBase64ImgData(srcTxt)
        });
    } else {
        allImages.push({
            originalUrl: getImgDownloadUrl(srcTxt),
            filename: newImgFileName // TODO name
        });
    }

    return '../images/' + newImgFileName;
}

/**
 * Builds a random 5-character string of ASCII letters (upper and lower case).
 *
 * @returns {string} The generated string.
 */
function generateRandomTag() {
    var text = '';
    var possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';

    for (var i = 0; i < 5; i++) {
        text += possible.charAt(Math.floor(Math.random() * possible.length));
    }

    return text;
}

/**
 * Replaces every `pre` and `code` descendant of the given jQuery object with
 * a string built from the element's innerText.
 *
 * NOTE(review): the string literals around `innerText` contain only line
 * breaks here; they look like markup wrappers that were lost when this file
 * was extracted - confirm against the upstream source. Also note that the
 * text is passed to replaceWith() unescaped.
 *
 * @param {jQuery} $jQueryElement - Root whose descendants are rewritten in place.
 */
function formatPreCodeElements($jQueryElement) {
    $jQueryElement.find('pre').each(function (i, pre) {
        $(pre).replaceWith('\n' + pre.innerText + '');
    });
    $jQueryElement.find('code').each(function (i, pre) {
        $(pre).replaceWith('\n' + pre.innerText + '\n');
    });
}
/**
 * Strips non-content elements from a jQuery object in place before it is
 * serialized.
 *
 * Steps, in order:
 *   1. each `script[type="math/mml"]` is replaced by its own innerHTML;
 *   2. script, style, svg, canvas, noscript and iframe elements are removed;
 *   3. every `:empty` element except `img` is removed;
 *   4. elements whose class attribute starts with "mjx-chtml" are removed;
 *   5. pre/code elements are rewritten by formatPreCodeElements().
 *
 * NOTE(review): step 1 concatenates empty strings around innerHTML, which
 * suggests wrapper markup may have been lost from this copy of the file -
 * confirm against the upstream source.
 *
 * @param {jQuery} $htmlObject - Root whose descendants are cleaned.
 */
function preProcess($htmlObject) {
    // Must run before the blanket script removal below, otherwise the MathML
    // payload carried by these script elements would be discarded with them.
    var $mathScripts = $htmlObject.find('script[type="math/mml"]');
    $mathScripts.each(function (position, scriptNode) {
        var replacement = '' + scriptNode.innerHTML + '';
        $(scriptNode).replaceWith(replacement);
    });

    var $nonContent = $htmlObject.find('script, style, svg, canvas, noscript, iframe');
    $nonContent.remove();

    var $emptyNodes = $htmlObject.find('*:empty');
    var $emptyExceptImages = $emptyNodes.not('img');
    $emptyExceptImages.remove();

    // MathJax formatting
    var $mathJaxOutput = $htmlObject.find('*[class^="mjx-chtml"]');
    $mathJaxOutput.remove();

    formatPreCodeElements($htmlObject);
}
/**
 * Serializes a jQuery object to a markup string that keeps only `img`, `a`
 * and the tags listed in `allowedTags`.
 *
 * Each kept element is replaced in the DOM by text of the form
 * `<tagOpen>name<tagClose>...<tagOpen>/name<tagClose>`, where tagOpen and
 * tagClose are random placeholder tokens. The text content of the whole tree
 * is then read and the placeholders are turned into '<' and '>'. Anything
 * that was not replaced by placeholders therefore survives only as text.
 * Attributes are dropped, except `src` on images (rewritten by getImageSrc)
 * and `href` on links (rewritten by getHref).
 *
 * The allowed-tag pass is skipped when the current document holds
 * `maxNrOfElements` elements or more.
 *
 * @param {jQuery|*} $content - Content to serialize; when `withError` is
 *     truthy it is first wrapped with `$()` and run through preProcess().
 * @param {boolean} withError - Whether `$content` still needs wrapping and
 *     pre-processing.
 * @returns {string} The serialized markup, or '' if anything throws.
 */
function force($content, withError) {
    try {
        var tagOpen = '@@@' + generateRandomTag();
        var tagClose = '###' + generateRandomTag();
        var startEl = '';
        // Previously never declared, so the first replacement below threw a
        // ReferenceError and the function always fell through to `return ''`.
        // NOTE(review): mirrors the empty `startEl`; both may originally have
        // held wrapper markup that was lost from this copy - confirm upstream.
        var endEl = '';

        var $root = $content;
        if (withError) {
            $root = $($content);
            preProcess($root);
        }

        $root.find('img').each(function (index, elem) {
            var imgSrc = getImageSrc($(elem).attr('src'));
            if (imgSrc === '') {
                $(elem).replaceWith('');
            } else {
                $(elem).replaceWith(startEl + tagOpen + 'img src="' + imgSrc + '"' + tagClose + tagOpen + '/img' + tagClose + endEl);
            }
        });

        $root.find('a').each(function (index, elem) {
            var aHref = getHref($(elem).attr('href'));
            if (aHref === '') {
                $(elem).replaceWith('');
            } else {
                $(elem).replaceWith(startEl + tagOpen + 'a href="' + aHref + '"' + tagClose + $(elem).html() + tagOpen + '/a' + tagClose + endEl);
            }
        });

        if ($('*').length < maxNrOfElements) {
            allowedTags.forEach(function (tagName) {
                // Re-query after every replacement: replacing an element
                // re-creates its descendants from HTML, so a previously
                // collected set would point at detached nodes.
                var tmpElems = $root.find(tagName);
                while (tmpElems.length > 0) {
                    // Declared locally; it used to leak as an implicit global.
                    var $tmpElem = $(tmpElems[0]);
                    $tmpElem.replaceWith(startEl + tagOpen + tagName + tagClose + $tmpElem.html() + tagOpen + '/' + tagName + tagClose + endEl);
                    tmpElems = $root.find(tagName);
                }
            });
        }

        var contentString = $root.text();

        var tagOpenRegex = new RegExp(tagOpen, 'gi');
        var tagCloseRegex = new RegExp(tagClose, 'gi');
        contentString = contentString.replace(tagOpenRegex, '<');
        contentString = contentString.replace(tagCloseRegex, '>');
        // NOTE(review): as written this replaces a space with a space; the
        // pattern may originally have targeted a non-breaking space/entity -
        // confirm against the upstream source.
        contentString = contentString.replace(/ /gi, ' ');

        return contentString;
    } catch (e) {
        console.log('Error:', e);
        return '';
    }
}
function sanitize(rawContentString) {
allImages = [];
extractedImages = [];
var srcTxt = '';
var dirty = null;
try {
var wdirty = $.parseHTML(rawContentString);
$wdirty = $(wdirty);
preProcess($wdirty);
if ($('*').length > maxNrOfElements) {
return force($wdirty, false);
}
dirty = '