<?php
/**
 * Whitelist-based HTML sanitizer.
 *
 * Input is first passed through strip_tags() with the configured tag list,
 * then parsed with DOMDocument so that:
 *  - attributes not whitelisted for their tag are removed,
 *  - "style" attributes are reduced to the declarations allowed per element,
 *  - every <a href> not containing "http://<current host>" gets target="_blank",
 *  - whitelisted URI attributes without an allowed scheme get "http://" prepended.
 */
class StripTags
{
    /** Regex fragment matching a 3- or 6-digit hex colour; usable as a style value pattern. */
    const STYLE_COLOR_PATTERN = '\#(?:[\da-f]{3}){1,2}';

    /**
     * Attributes that carry a URI, keyed by (lower-case) tag name.
     *
     * Not covered at the moment:
     *   'head' => array('profile'),
     *   'body' => array('background'),
     *   'base' => array('href'),
     */
    protected static $uri_attrs = array(
        'a' => array('href'),
        'area' => array('href'),
        'link' => array('href'),
        'img' => array('src', 'longdesc', 'usemap'),
        'object' => array('classid', 'codebase', 'data', 'usemap'),
        'applet' => array('codebase'),
        'q' => array('cite'),
        'blockquote' => array('cite'),
        'form' => array('action'),
        // Declared once: a second 'input' key would overwrite this one and drop 'usemap'.
        'input' => array('src', 'usemap'),
        'frame' => array('longdesc', 'src'),
        'iframe' => array('longdesc', 'src'),
        'script' => array('src', 'for'),
        'embed' => array('pluginpage', 'src'),
    );

    /** @var array tag name => list of allowed attribute names (all lower-case) */
    protected $tags_attributes;

    /** @var string allowed-tags argument for strip_tags(), e.g. "<a><p>" */
    protected $valid_tags_str;

    /** @var bool stored from the constructor; not read by any method of this class */
    protected $remove_js_href;

    /** @var array tag name => compiled regex of allowed style declarations */
    protected $_el2styles;

    /** @var string XPath union selecting elements that have a whitelisted URI attribute */
    protected $clean_uri_xpath;

    /** @var array tag name => whitelisted URI attributes to check on that tag */
    protected $check_uri_attrs;

    /**
     * @param array|string $valid_tags     map tag => allowed attributes, or a string
     *                                     accepted by parseValidTagsString()
     * @param array        $el2styles      style configuration, see compileEl2Style()
     * @param bool         $remove_js_href stored in $remove_js_href
     */
    public function __construct($valid_tags = array(), $el2styles = array(), $remove_js_href = true)
    {
        if (is_string($valid_tags)) {
            $valid_tags = self::parseValidTagsString($valid_tags);
        }

        $this->tags_attributes = array();
        foreach ($valid_tags as $tag => $attributes) {
            // strip_tags() matches tags case-insensitively and the DOM reports
            // lower-case names, so the whitelist keys must be lower-case too;
            // otherwise attributes of such a tag would never be filtered.
            $tag = strtolower(trim($tag));
            if ($tag === '') {
                continue;
            }
            $this->tags_attributes[$tag] = array_map('strtolower', array_map('trim', (array) $attributes));
        }

        $this->valid_tags_str = $this->tags_attributes
            ? '<' . implode('><', array_keys($this->tags_attributes)) . '>'
            : '';
        $this->remove_js_href = $remove_js_href;
        $this->_el2styles = self::compileEl2Style($el2styles);

        $this->check_uri_attrs = array();
        $xpath_parts = array();
        foreach (self::$uri_attrs as $tag => $attributes) {
            if (!isset($this->tags_attributes[$tag])) {
                continue;
            }
            // Only URI attributes that survive the whitelist need checking.
            $attributes = array_intersect($attributes, $this->tags_attributes[$tag]);
            if (!count($attributes)) {
                continue;
            }
            $this->check_uri_attrs[$tag] = $attributes;
            $xpath_parts[] = '//' . $tag . '[@' . implode(' or @', $attributes) . ']';
        }
        $this->clean_uri_xpath = implode(' | ', $xpath_parts);
    }

    /**
     * @return array map tag => allowed attribute names
     */
    public function getValidTags()
    {
        return $this->tags_attributes;
    }

    /**
     * Sanitizes an HTML fragment.
     *
     * @param string $text raw HTML
     * @return string the serialized content of the parsed <body>, or '' when
     *                nothing is left after stripping or the markup cannot be parsed
     */
    public function process($text)
    {
        $text = trim(strip_tags((string) $text, $this->valid_tags_str));
        // Strict comparison: the text "0" is valid content, not an empty input.
        if ($text === '') {
            return '';
        }

        // The <meta> makes the HTML parser treat the fragment as UTF-8.
        $html = '<head><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>'
            . str_replace(array("\r\n", "\r"), "\n", $text);

        $document = new DOMDocument('1.0', 'UTF-8');

        // Collect parser complaints internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $loaded = $document->loadHTML($html);
        if (!$previous) {
            // Only discard the buffer when the caller was not collecting errors.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if (!$loaded || !$document->documentElement) {
            return '';
        }

        $this->cleanNode($document->documentElement);
        $this->cleanUseXPath($document);

        $result = $document->saveHTML();
        $start = stripos($result, '<body>');
        $end = strripos($result, '</body>');
        if ($start === false || $end === false || $end < $start + 6) {
            return '';
        }

        // 6 === strlen('<body>')
        return substr($result, $start + 6, $end - $start - 6);
    }

    /**
     * @return string $_SERVER['HTTP_HOST'] when set and non-empty, otherwise 'localhost'
     */
    public static function getHttpHost()
    {
        return !empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : 'localhost';
    }

    /**
     * Applies the XPath-driven clean-up passes to a parsed document:
     * style filtering, target="_blank" on links, and URI scheme enforcement.
     *
     * @param DOMDocument $document document to modify in place
     */
    public function cleanUseXPath($document)
    {
        $xpather = new DOMXPath($document);

        // 1. Keep only the allowed style declarations.
        foreach ($xpather->query('//*[@style]') as $entry) {
            if (!isset($this->_el2styles[$entry->tagName])) {
                $entry->removeAttribute('style');
                continue;
            }

            $styles = array();
            $found = preg_match_all(
                $this->_el2styles[$entry->tagName],
                ';' . $entry->getAttribute('style') . ';',
                $styles
            );

            $declarations = array();
            if ($found && !empty($styles[0])) {
                foreach ($styles[0] as $match) {
                    // Each match still carries its leading separator.
                    $declaration = trim($match, "; \t\n\r\0\x0B\f");
                    if ($declaration !== '') {
                        $declarations[] = $declaration;
                    }
                }
            }

            if ($declarations) {
                $entry->setAttribute('style', implode('; ', $declarations));
            } else {
                $entry->removeAttribute('style');
            }
        }

        // 2. Links whose href does not contain "http://<host>" open in a new window.
        $host = 'http://' . self::getHttpHost();
        foreach ($xpather->query('//a[@href]') as $entry) {
            if (strpos($entry->getAttribute('href'), $host) === false) {
                // setAttribute() both creates and overwrites the attribute.
                $entry->setAttribute('target', '_blank');
            }
        }

        // 3. Enforce allowed schemes on whitelisted URI attributes.
        if ((string) $this->clean_uri_xpath === '') {
            // No URI attribute is whitelisted; an empty XPath expression is invalid.
            return;
        }

        $allow_schemes = array('http', 'https', 'ftp', 'mailto');
        foreach ($xpather->query($this->clean_uri_xpath) as $entry) {
            if (!isset($this->check_uri_attrs[$entry->tagName])) {
                continue;
            }
            foreach ($this->check_uri_attrs[$entry->tagName] as $atribute) {
                // The XPath matches when ANY listed attribute is present;
                // skip the absent ones so no empty attributes get created.
                if (!$entry->hasAttribute($atribute)) {
                    continue;
                }

                // Case is preserved: paths and query strings are case-sensitive.
                $uri = trim($entry->getAttribute($atribute));

                // Empty, fragment-only and relative ("/", ".") references are kept as is.
                if ($uri === '' || $uri[0] === '#' || $uri[0] === '/' || $uri[0] === '.') {
                    $entry->setAttribute($atribute, $uri);
                    continue;
                }

                $vars = parse_url($uri);
                $scheme = (is_array($vars) && isset($vars['scheme'])) ? strtolower($vars['scheme']) : '';

                if (in_array($scheme, $allow_schemes, true)) {
                    $entry->setAttribute($atribute, $uri);
                } else {
                    // Anything else (no scheme, "javascript:", "data:", ...) is
                    // turned into an http URL.
                    $entry->setAttribute($atribute, 'http://' . $uri);
                }
            }
        }
    }

    /**
     * Recursively removes attributes that are not whitelisted for their tag.
     * Nodes whose name is not in the whitelist keep their attributes.
     *
     * @param DOMNode $node subtree root
     */
    public function cleanNode($node)
    {
        if ($node->attributes && isset($this->tags_attributes[$node->nodeName])) {
            $allowed = $this->tags_attributes[$node->nodeName];

            // Collect first, remove afterwards: the attribute map is live.
            $remove_attributes = array();
            foreach ($node->attributes as $atribute) {
                if (!in_array($atribute->name, $allowed, true)) {
                    $remove_attributes[] = $atribute->name;
                }
            }
            foreach ($remove_attributes as $name) {
                $node->removeAttribute($name);
            }
        }

        if ($node->childNodes) {
            foreach ($node->childNodes as $child) {
                $this->cleanNode($child);
            }
        }
    }

    /**
     * Parses a compact whitelist description such as
     * 'cut,hr[class|width|size|noshade],ap[href]'.
     *
     * @param string $string comma-separated tags, each optionally followed by
     *                       a "[attr|attr|...]" list
     * @return array map tag => list of attribute names (empty list when none given)
     */
    public static function parseValidTagsString($string)
    {
        $valid_tags = array();
        foreach (explode(',', $string) as $str_tag) {
            // Tolerate whitespace around the separators ("a[href] , b").
            $str_tag = trim($str_tag);
            if (!preg_match('/^([^\[]+)(\[(.+)\])?$/', $str_tag, $matches)) {
                continue;
            }
            $valid_tags[trim($matches[1])] = isset($matches[3]) ? explode('|', $matches[3]) : array();
        }

        return $valid_tags;
    }

    /**
     * Compiles a style configuration into one regex per element.
     *
     * @param array $conf map element => (map property name => allowed values);
     *                    values given as an array are literal alternatives,
     *                    values given as a string are used as a raw regex fragment
     * @return array map lower-case element name => regex matching one allowed
     *               declaration preceded by a separator (whitespace or ";")
     */
    public static function compileEl2Style($conf)
    {
        $_el2styles = array();
        foreach ($conf as $el => $props) {
            $match = array();
            foreach ($props as $name => $values) {
                if (is_array($values)) {
                    $quoted = array();
                    foreach ($values as $value) {
                        // "~" is the pattern delimiter and must be escaped as well.
                        $quoted[] = preg_quote($value, '~');
                    }
                    $values = implode('|', $quoted);
                }
                // The trailing separator is only looked at, not consumed, so it
                // can also start the next declaration ("a:b;c:d").
                $match[] = '(?:[\s;]' . preg_quote($name, '~') . '\s*:\s*(?:' . $values . ')(?=[\s;]))';
            }

            if (!$match) {
                // An empty alternation would match the empty string everywhere.
                continue;
            }
            // DOM tag names are lower-case, so the lookup keys must be too.
            $_el2styles[strtolower($el)] = '~(?:' . implode('|', $match) . ')~i';
        }

        return $_el2styles;
    }
}