/*
 * Shared declarations for the HTML optimizer: helper macros, the table of
 * known HTML tags with their per-tag flags, small POD structs, and the
 * prototype of the optimize_html() entry point.
 */
#ifndef __HTML_OPTIMIZER
#define __HTML_OPTIMIZER

/* NOTE(review): presumably these clash with macros of the same name coming
 * from the Perl headers in the disabled XS block below -- confirm. */
#undef do_open
#undef do_close

/* NOTE(review): the header names of the original #include directives were
 * lost. The list below covers what this file visibly uses (isalpha/isdigit,
 * printf, malloc/realloc, memcmp, strncasecmp, std::vector); <string> is a
 * guess. Verify it against the implementation file. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <string>
#include <vector>

/* Kept because code including this header may rely on it; new code should
 * qualify names with std:: explicitly. */
using namespace std;

#define MAX_BREAKS 2

/* Disabled in the original. Note that, as written, it yields the larger value. */
// #define MIN(a, b) ((a) > (b) ? (a) : (b))

/* Compare the first 2 / 4 bytes at two addresses for equality.
 * memcmp is used instead of casting to short* / int*: it has no alignment or
 * aliasing requirements. The previous fast_cmp4 compared the two pointers
 * themselves rather than the bytes they point to. */
#define fast_cmp2(a, b) (memcmp((a), (b), 2) == 0)
#define fast_cmp4(a, b) (memcmp((a), (b), 4) == 0)

/* Both macros evaluate their arguments more than once. */
#define MATH_MIN(a, b) ((a) < (b) ? (a) : (b))
#define MATH_MAX(a, b) ((a) > (b) ? (a) : (b))

/* True for space, LF, CR, TAB and VT. Evaluates c several times. */
#define IS_SPACE(c) \
	((c) == ' ' || (c) == '\n' || (c) == '\r' || (c) == '\t' || (c) == '\v')

/* True for characters accepted in a tag name: letters, digits, '_', '-', ':'.
 * The cast to unsigned char keeps isalpha()/isdigit() defined for bytes
 * >= 0x80 when plain char is signed. Evaluates c several times. */
#define VALIDATE_TAG_NAME(c) \
	(isalpha((unsigned char)(c)) || (c) == '_' || (c) == '-' || (c) == ':' || \
	 isdigit((unsigned char)(c)))

/* Debug output: forwards to printf (GNU named-variadic macro syntax). */
#define D(msg, args...) printf(msg, ##args)

/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* True when url + offset starts (case-insensitively) with "://", "http://"
 * or "https://" and at least 7 / 14 / 15 characters respectively remain
 * before end. */
#define CHECK_LINK(url, offset, end) \
	( \
		((offset) + 7 < (end) && strncasecmp((url) + (offset), "://", 3) == 0) || \
		((offset) + 14 < (end) && strncasecmp((url) + (offset), "http://", 7) == 0) || \
		((offset) + 15 < (end) && strncasecmp((url) + (offset), "https://", 8) == 0) \
	)

/* True when text[offset] begins the entity "&nbsp;" (case-insensitive).
 * text[offset] is read before the bounds test, so offset itself must already
 * be a valid index. */
#define IS_HTML_SPACE(text, offset, end) \
	( \
		(text)[(offset)] == '&' && (offset) + 6 < (end) && \
		strncasecmp(&(text)[(offset) + 1], "nbsp;", 5) == 0 \
	)

#define EXTERNAL_LINK_REDIRECT "/?sid=::sid::;redirect="

/* Expand a string literal into "pointer, length" / "length, pointer" argument
 * pairs; the length excludes the terminating NUL. */
#define const_str_len(str) str, (sizeof(str) - 1)
#define const_len_str(str) (sizeof(str) - 1), str

/* Same idea for arbitrary objects, using the full sizeof.
 * These previously expanded to an undeclared `str` instead of `obj`. */
#define const_obj_size(obj) obj, sizeof(obj)
#define const_size_obj(obj) sizeof(obj), obj

/* Fill every field of a tag record.
 * Expands to several statements: always use inside braces, never as the
 * unbraced body of an if/else/loop. */
#define NEW_TAG(tag, _id, start_open, end_open, start_close, end_close) \
	(tag).id = _id; \
	(tag).open_start = start_open; \
	(tag).open_end = end_open; \
	(tag).close_start = start_close; \
	(tag).close_end = end_close;

/* Fill every field of a ReplaceTag. Same multi-statement caveat as NEW_TAG. */
#define NEW_REPLACE(tag, _id, _start, _end) \
	(tag).id = _id; \
	(tag).start = _start; \
	(tag).end = _end;

#define NEW_TAG_META

/* Grow vector v by one element and yield a reference to the new last element.
 * Relies on the GNU statement-expression extension. */
#define new_vector_item(v) v.at(({v.resize(v.size() + 1); (v.size() - 1); }))

/* Allocator selection; the Perl XS variant is compiled out. */
#if 0
	#include "EXTERN.h"
	#include "perl.h"
	#include "XSUB.h"

	#define mem_realloc(p, s) saferealloc(p, s)
	#define mem_alloc(s) safemalloc(s)
#else
	#define mem_realloc(p, s) realloc(p, s)
	#define mem_alloc(s) malloc(s)
#endif

/* Domain names with their lengths.
 * NOTE(review): presumably the hosts treated as internal links -- confirm. */
static const struct {
	const char *domain;
	unsigned char length;
} allowed_domains[] = {
	{const_str_len("spaces.ru")},
	{const_str_len("space.me")}
};

/* Lowercase hexadecimal digits, indexed by nibble value. */
static const char hex_digits[] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};

/* Replacement kinds.
 * NOTE(review): presumably the values stored in ReplaceTag::id -- confirm. */
enum {
	R_NONE = 0,
	R_SPACE,
	R_BREAK,
	R_HTML_WRAP,
	R_IMG_TAG,
	R_EXTERNAL_LINK,
	R_INTERNAL_LINK,
};

/* Tag name held in a fixed inline buffer, plus its length. */
struct TagNameList {
	char name[11];
	unsigned char len;
};

/* Bit flags stored in HtmlTags::flags. The meanings are only implied by the
 * names here; check the implementation before relying on them. */
enum {
	TAG_ALLOW_SELF_CLOSURE = 1 << 0,
	TAG_IS_BLOCK           = 1 << 1,
	TAG_SAVE_ATTRS         = 1 << 2,
	TAG_IS_BREAK           = 1 << 3,
	TAG_REPLACE_INNERS     = 1 << 4,
	TAG_IS_ALLOWED         = 1 << 5,
	TAG_SKIP               = 1 << 6,
	TAG_ALT                = 1 << 7,
};

/* One entry of the html_tags table: tag name, its length, TAG_* flag bits. */
typedef struct HtmlTags {
	const char *name;
	unsigned char len;
	unsigned char flags;
} HtmlTags;

/* A replacement of kind `id` covering the offsets start..end. */
struct ReplaceTag {
	unsigned char id;
	unsigned int start;
	unsigned int end;
};

/* Character buffer with its used length and allocated capacity. */
typedef struct t_str {
	char *value;
	unsigned int length;
	unsigned int allocated;
} t_str;

/* Sentinel tag id: the largest unsigned char value. */
#define NULL_TAG_ID ((unsigned char) -1)

/* Tag ids. Each value is the index of the matching entry in html_tags, so the
 * two lists must be kept in the same order. */
enum {
	TAG_UNKNOWN = 0,
	TAG_S,
	TAG_U,
	TAG_B,
	TAG_I,
	TAG_IMG,
	TAG_A,
	TAG_BR,
	TAG_HEAD,
	TAG_STYLE,
	TAG_TITLE,
	TAG_SCRIPT,
	TAG_TEXTAREA,
	TAG_NOSTRIPT,                 /* historical misspelling, kept for existing users */
	TAG_NOSCRIPT = TAG_NOSTRIPT,  /* correctly spelled alias, same value */
	TAG_HTML,
	TAG_TD,
	TAG_ADDRESS,
	TAG_ARTICLE,
	TAG_ASIDE,
	TAG_AUDIO,
	TAG_BLOCKQUOTE,
	TAG_CANVAS,
	TAG_DD,
	TAG_DIV,
	TAG_DL,
	TAG_FIELDSET,
	TAG_FIGCAPTION,
	TAG_FIGURE,
	TAG_FOOTER,
	TAG_FORM,
	TAG_H1,
	TAG_H2,
	TAG_H3,
	TAG_H4,
	TAG_H5,
	TAG_H6,
	TAG_HEADER,
	TAG_HGROUP,
	TAG_HR,
	TAG_OL,
	TAG_OUTPUT,
	TAG_P,
	TAG_PRE,
	TAG_SECTION,
	TAG_TABLE,
	TAG_TFOOT,
	TAG_TBODY,
	TAG_THEAD,
	TAG_UL,
	TAG_VIDEO,
	TAG_LI,
	TAG_TR,
	TAG_XBODY,
	TAG_XHTML,
};

/* Known tags, listed in the same order as the TAG_* ids above. */
static const HtmlTags html_tags[] = {
	{NULL, 0, TAG_SKIP},
	{const_str_len("s"), TAG_IS_ALLOWED},
	{const_str_len("u"), TAG_IS_ALLOWED},
	{const_str_len("b"), TAG_IS_ALLOWED},
	{const_str_len("i"), TAG_IS_ALLOWED},
	{const_str_len("img"), TAG_ALT | TAG_ALLOW_SELF_CLOSURE},
	{const_str_len("a"), TAG_SAVE_ATTRS | TAG_IS_ALLOWED},
	{const_str_len("br"), TAG_IS_BREAK | TAG_ALLOW_SELF_CLOSURE},
	{const_str_len("head"), TAG_REPLACE_INNERS},
	{const_str_len("style"), TAG_REPLACE_INNERS},
	{const_str_len("title"), TAG_REPLACE_INNERS},
	{const_str_len("script"), TAG_REPLACE_INNERS},
	{const_str_len("textarea"), TAG_REPLACE_INNERS},
	{const_str_len("noscript"), TAG_IS_BLOCK},
	{const_str_len("html"), TAG_SKIP},
	{const_str_len("td"), TAG_SKIP},
	{const_str_len("address"), TAG_IS_BLOCK},
	{const_str_len("article"), TAG_IS_BLOCK},
	{const_str_len("aside"), TAG_IS_BLOCK},
	{const_str_len("audio"), TAG_IS_BLOCK},
	{const_str_len("blockquote"), TAG_IS_BLOCK},
	{const_str_len("canvas"), TAG_IS_BLOCK},
	{const_str_len("dd"), TAG_IS_BLOCK},
	{const_str_len("div"), TAG_IS_BLOCK},
	{const_str_len("dl"), TAG_IS_BLOCK},
	{const_str_len("fieldset"), TAG_IS_BLOCK},
	{const_str_len("figcaption"), TAG_IS_BLOCK},
	{const_str_len("figure"), TAG_IS_BLOCK},
	{const_str_len("footer"), TAG_IS_BLOCK},
	{const_str_len("form"), TAG_IS_BLOCK},
	{const_str_len("h1"), TAG_IS_BLOCK},
	{const_str_len("h2"), TAG_IS_BLOCK},
	{const_str_len("h3"), TAG_IS_BLOCK},
	{const_str_len("h4"), TAG_IS_BLOCK},
	{const_str_len("h5"), TAG_IS_BLOCK},
	{const_str_len("h6"), TAG_IS_BLOCK},
	{const_str_len("header"), TAG_IS_BLOCK},
	{const_str_len("hgroup"), TAG_IS_BLOCK},
	{const_str_len("hr"), TAG_IS_BLOCK},
	{const_str_len("ol"), TAG_IS_BLOCK},
	{const_str_len("output"), TAG_IS_BLOCK},
	{const_str_len("p"), TAG_IS_BLOCK},
	{const_str_len("pre"), TAG_IS_BLOCK},
	{const_str_len("section"), TAG_IS_BLOCK},
	{const_str_len("table"), TAG_IS_BLOCK},
	{const_str_len("tfoot"), TAG_IS_BLOCK},
	{const_str_len("tbody"), TAG_IS_BLOCK},
	{const_str_len("thead"), TAG_IS_BLOCK},
	{const_str_len("ul"), TAG_IS_BLOCK},
	{const_str_len("video"), TAG_IS_BLOCK},
	{const_str_len("li"), TAG_IS_BLOCK},
	{const_str_len("tr"), TAG_IS_BLOCK},
	{const_str_len("xbody"), TAG_SKIP},
	{const_str_len("xhtml"), TAG_SKIP},
};

/* Bit values accepted in the `flags` argument of optimize_html(). */
enum {
	FLAGS_CLEAR_SPACES = 1,
	FLAGS_DEBUG        = 2
};

/* Token kinds.
 * NOTE(review): presumably the kinds the tokenizer distinguishes -- confirm. */
enum {
	TOK_UNK = 0,
	TOK_TAG,
	TOK_COMMENT,
	TOK_DOCTYPE,
	TOK_CDATA,
	TOK_COMMENT2,
};

/*
 * Entry point of the optimizer (defined elsewhere, C linkage).
 *
 * text_val / text_len: input buffer and its length.
 * flags: combination of FLAGS_* bits; defaults to 0.
 * dup: NOTE(review): presumably selects whether the result is a fresh copy
 *      rather than the input buffer modified in place -- confirm, together
 *      with who owns the returned t_str::value.
 */
extern "C" t_str optimize_html(char *text_val, unsigned int text_len,
                               unsigned int flags = 0, bool dup = false);

#endif