php error_reporting E_ALL class xml_corrector public _allowed_nodes ar

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
<?php
// Report every PHP error level (E_ALL) for the rest of this script.
error_reporting(E_ALL);
/**
 * Whitelist-based filter for HTML fragments.
 *
 * Elements survive only when their tag name is listed in $_allowed_nodes and
 * every attribute they carry is listed for that tag in $_allowed_attributes
 * with a regular expression that matches the attribute value. Text nodes are
 * always kept; every other node type (comments, CDATA, ...) is dropped.
 * A rejected element is removed together with its whole subtree.
 */
class xml_corrector {
    /**
     * Tag names that are allowed to stay in the document.
     *
     * @var string[]
     */
    public $_allowed_nodes = array();

    /**
     * Allowed attributes per tag: array(tagName => array(attrName => PCRE pattern)).
     * An attribute that is missing from this map, or whose value does not
     * match its pattern, causes the whole element to be rejected.
     *
     * @var array
     */
    public $_allowed_attributes = array();

    /**
     * Decide whether a node may stay, pruning its rejected descendants in place.
     *
     * @param DOMNode $node Node to inspect; children of an accepted element
     *                      are filtered recursively and removed when rejected.
     * @return bool True when the node may stay, false when the caller should remove it.
     */
    public function correct_node($node) {
        if (!($node instanceof DOMNode)) {
            return false;
        }

        switch ($node->nodeType) {
            case XML_TEXT_NODE:
                return true;

            case XML_ELEMENT_NODE:
                // Strict comparison: tag names are strings, so no type juggling is wanted.
                if (!in_array($node->nodeName, $this->_allowed_nodes, true)) {
                    return false;
                }

                foreach ($node->attributes as $attrName => $attrNode) {
                    if (!isset($this->_allowed_attributes[$node->nodeName][$attrName])) {
                        return false;
                    }
                    // preg_match() yields 0 on "no match" and false on error; both reject.
                    if (!preg_match($this->_allowed_attributes[$node->nodeName][$attrName], $attrNode->nodeValue)) {
                        return false;
                    }
                }

                $this->remove_rejected_children($node);
                return true;
        }

        return false;
    }

    /**
     * Parse an HTML fragment, strip everything that is not whitelisted and
     * return the remaining <body> element serialised as an XML document.
     *
     * When the input is empty, cannot be parsed, or yields no <body>, an
     * empty <body/> document is returned instead of failing.
     *
     * @param string $input HTML fragment to clean.
     * @return string Result of DOMDocument::saveXML() for the filtered <body>.
     */
    public function get_HTML($input) {
        $html = (string) $input;
        $body = null;

        if ($html !== '') {
            // Collect parser warnings internally, then restore the caller's
            // setting instead of forcing it to false.
            $previous = libxml_use_internal_errors(true);

            // loadHTML() is an instance method; calling it statically fails on PHP 8.
            $document = new DOMDocument();
            $loaded = $document->loadHTML($html);

            // Drop the buffered parse errors so they do not accumulate across calls.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            if ($loaded) {
                $body = $document->getElementsByTagName('body')->item(0);
            }
        }

        $newdoc = new DOMDocument();
        $newdoc->formatOutput = false;

        if ($body === null) {
            $newdoc->appendChild($newdoc->createElement('body'));
        } else {
            $this->remove_rejected_children($body);
            $newdoc->appendChild($newdoc->importNode($body, true));
        }

        return $newdoc->saveXML();
    }

    /**
     * Remove every direct child of $parent that correct_node() rejects.
     *
     * @param DOMNode $parent Node whose children are filtered in place.
     * @return void
     */
    private function remove_rejected_children($parent) {
        if (!$parent->hasChildNodes()) {
            return;
        }

        // childNodes is a live list: take a snapshot first so that removing
        // a child cannot shift the nodes that are still to be visited.
        $children = iterator_to_array($parent->childNodes, false);
        foreach ($children as $child) {
            if (true !== $this->correct_node($child)) {
                $parent->removeChild($child);
            }
        }
    }
}
// Demo run: filter a sample fragment that mixes permitted tags with a
// namespaced tag, an unknown tag, a script, a forbidden attribute and
// unclosed markup, then dump the cleaned result.
$sample_markup = '<t:b>tnamespace</t:b><b title="test">correct</b><b title="test" help="fff">test</b><v>!!!</v><script>var test=\'ffff\';</script><a href="ttt">link</a><b>af<p>para</p><div>111</div>';

// Per-tag attribute whitelist; each value is the PCRE pattern the attribute must match.
$attribute_rules = array(
    'a' => array('href' => '/^[a-zA-Z\.\-_0-9:\/]+$/'),
    'b' => array('title' => '/^[a-zA-Z0-9_\-\ ]+$/'),
);

$t = new xml_corrector();
$t->_allowed_nodes = array('a', 'b', 'div');
$t->_allowed_attributes = $attribute_rules;

$cleaned = $t->get_HTML($sample_markup);
var_dump($cleaned);