Include `hr` and `br` HTML tags when extracting data

This commit is contained in:
Alex Adam 2017-10-11 21:19:24 +03:00
parent 0671fca24f
commit 11604a8a56

View file

@ -92,7 +92,7 @@ function preProcess($htmlObject) {
extractCanvasToImg($htmlObject); extractCanvasToImg($htmlObject);
extractSvgToImg($htmlObject); extractSvgToImg($htmlObject);
$htmlObject.find('script, style, noscript, iframe').remove(); $htmlObject.find('script, style, noscript, iframe').remove();
$htmlObject.find('*:empty').not('img').remove(); $htmlObject.find('*:empty').not('img').not('br').not('hr').remove();
formatPreCodeElements($htmlObject); formatPreCodeElements($htmlObject);
} }
@ -212,6 +212,14 @@ function sanitize(rawContentString) {
} }
} }
lastFragment = tmpAttrsTxt.length === 0 ? '<a>' : '<a ' + tmpAttrsTxt + '>'; lastFragment = tmpAttrsTxt.length === 0 ? '<a>' : '<a ' + tmpAttrsTxt + '>';
} else if (tag === 'br' || tag === 'hr') {
var tmpAttrsTxt = '';
for (var i = 0; i < attrs.length; i++) {
if (attrs[i].name === 'data-class') {
tmpAttrsTxt += ' class="' + attrs[i].value + '"';
}
}
lastFragment = '<' + tag + ' ' + tmpAttrsTxt + '></' + tag + '>';
} else { } else {
var tmpAttrsTxt = ''; var tmpAttrsTxt = '';
for (var i = 0; i < attrs.length; i++) { for (var i = 0; i < attrs.length; i++) {
@ -226,7 +234,7 @@ function sanitize(rawContentString) {
lastFragment = ''; lastFragment = '';
}, },
end: function(tag) { end: function(tag) {
if (allowedTags.indexOf(tag) < 0 || tag === 'img') { if (allowedTags.indexOf(tag) < 0 || tag === 'img' || tag === 'br' || tag === 'hr') {
return; return;
} }