#ifndef __HTML_OPTIMIZER
#define __HTML_OPTIMIZER
#undef do_open
#undef do_close
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Numeric limit related to "breaks". Its consumer is not visible in this header;
// presumably the maximum number of consecutive line breaks kept in the output (TODO confirm).
#define MAX_BREAKS 2
// Disabled macro, kept for reference only. NOTE: despite its name this expression
// selects the LARGER operand; use MATH_MIN / MATH_MAX instead of re-enabling it.
// #define MIN(a, b) ((a) > (b) ? (a) : (b))
// Compare the first sizeof(short) bytes (2 on the usual targets) at `a` and `b` with one
// load each. Both pointers must reference that many readable bytes; the cast assumes the
// platform tolerates unaligned, type-punned loads.
#define fast_cmp2(a, b) (*(const short *)(a) == *(const short *)(b))
// Compare the first sizeof(int) bytes (4 on the usual targets) at `a` and `b` with one
// load each; same preconditions as fast_cmp2.
// Fixed: both operands are now dereferenced. The previous form compared the two pointer
// values themselves, so it was true only when `a` and `b` were the same address.
#define fast_cmp4(a, b) (*(const int *)(a) == *(const int *)(b))
// Smaller of two values (yields `b` when they compare equal).
// Function-like macro: the selected operand is evaluated a second time, so do not pass
// expressions with side effects (e.g. MATH_MIN(i++, n)).
#define MATH_MIN(a, b) ((a) < (b) ? (a) : (b))
// Larger of two values (yields `b` when they compare equal).
// Same double-evaluation caveat as MATH_MIN.
#define MATH_MAX(a, b) ((a) > (b) ? (a) : (b))
// True when `c` is one of: space, LF, CR, TAB or VT. Form feed ('\f') is not in the list.
// The argument is parenthesized at every use so that expressions such as `ch & 0x7f`
// are compared as a whole (previously `==` bound tighter than the caller's operator).
// `c` is evaluated several times: do not pass expressions with side effects.
#define IS_SPACE(c) ((c) == ' ' || (c) == '\n' || (c) == '\r' || (c) == '\t' || (c) == '\v')
// True when `c` may appear in a tag name: a letter, a digit, '_', '-' or ':'.
// The value is converted to unsigned char before reaching isalpha()/isdigit(), because
// passing a negative char to the <ctype.h> classifiers is undefined behaviour.
// `c` is evaluated several times: do not pass expressions with side effects.
#define VALIDATE_TAG_NAME(c) (isalpha((unsigned char)(c)) || (c) == '_' || (c) == '-' || (c) == ':' || isdigit((unsigned char)(c)))
// Debug trace helper: forwards a printf-style format string and any further arguments to
// printf and yields printf's return value. The `##` in front of __VA_ARGS__ removes the
// preceding comma when no extra arguments are supplied (GNU preprocessor extension).
#define D(msg, ...) printf(msg, ##__VA_ARGS__)
// Element count of the array `a`: total size divided by the size of its first element.
// Only meaningful for true arrays; a pointer argument silently yields a wrong number.
#define ARRAY_SIZE(a) (sizeof(a) / sizeof(*(a)))
// True when the text at `url + offset` starts (case-insensitively) with "://", "http://"
// or "https://" AND enough room remains before `end`: `end - offset` must exceed
// 7, 14 and 15 respectively.
//   url    - pointer to the start of the buffer
//   offset - index of the candidate link start inside the buffer
//   end    - upper bound used for the length checks
// Every parameter is parenthesized so that arbitrary expressions (e.g. a conditional
// passed as `end`) are evaluated as a unit; previously such an argument changed how the
// comparison was parsed. Arguments are evaluated several times.
#define CHECK_LINK(url, offset, end) \
( \
    ((offset) + 7 < (end) && strncasecmp((url) + (offset), "://", 3) == 0) || \
    ((offset) + 14 < (end) && strncasecmp((url) + (offset), "http://", 7) == 0) || \
    ((offset) + 15 < (end) && strncasecmp((url) + (offset), "https://", 8) == 0) \
)
// True when the entity "&nbsp;" (matched case-insensitively after the '&') starts at
// `text[offset]` and `offset + 6 < end` holds.
// Changes from the previous form:
//   * every parameter is parenthesized, so pointer or conditional expressions can be
//     passed safely (`IS_HTML_SPACE(p + 2, 0, n)` did not compile before);
//   * the bounds test now runs first, so `text[offset]` is no longer read when the
//     bounds test fails.
// NOTE(review): if `end` is an exclusive bound, the strict `<` rejects an entity that
// ends exactly at `end`; kept as-is because the intended meaning of `end` is not
// visible here — confirm against the callers.
#define IS_HTML_SPACE(text, offset, end) \
( \
    (offset) + 6 < (end) && (text)[(offset)] == '&' && strncasecmp(&(text)[(offset) + 1], "nbsp;", 5) == 0 \
)
// URL prefix used for external-link redirects. Its consumer is not visible in this header;
// presumably it is prepended to rewritten external links and "::sid::" is a placeholder
// substituted elsewhere — TODO confirm.
#define EXTERNAL_LINK_REDIRECT "/?sid=::sid::;redirect="
// Helpers that expand to TWO comma-separated items, for passing a compile-time constant
// together with its size as consecutive arguments or initializers.
// String literal followed by its length without the terminating NUL.
#define const_str_len(str) str, (sizeof(str) - 1)
// Length without the terminating NUL followed by the string literal.
#define const_len_str(str) (sizeof(str) - 1), str
// Object followed by sizeof(object).
// Fixed: the expansion used the undeclared name `str` instead of the parameter `obj`.
#define const_obj_size(obj) obj, sizeof(obj)
// sizeof(object) followed by the object.
// Fixed: the expansion used the undeclared name `str` instead of the parameter `obj`.
#define const_size_obj(obj) sizeof(obj), obj
// Fills the five fields of a tag record: id, open_start, open_end, close_start, close_end.
//   tag - lvalue with those members (evaluated five times; pass a side-effect-free expression)
// The assignments are wrapped in do { } while (0) so the macro behaves as ONE statement:
// previously, under an unbraced `if`/`for`, only the first assignment was guarded and the
// rest always executed. Value arguments are parenthesized as well.
// The trailing ';' is intentionally kept so call sites written without their own
// semicolon continue to compile.
#define NEW_TAG(tag, _id, start_open, end_open, start_close, end_close) \
    do { \
        (tag).id = (_id); \
        (tag).open_start = (start_open); \
        (tag).open_end = (end_open); \
        (tag).close_start = (start_close); \
        (tag).close_end = (end_close); \
    } while (0);
// Fills the three fields of a replacement record: id, start, end.
//   tag - lvalue with those members (evaluated three times; pass a side-effect-free expression)
// The assignments are wrapped in do { } while (0) so the macro behaves as ONE statement:
// previously, under an unbraced `if`/`for`, only the first assignment was guarded and the
// rest always executed. Value arguments are parenthesized as well.
// The trailing ';' is intentionally kept so call sites written without their own
// semicolon continue to compile.
#define NEW_REPLACE(tag, _id, _start, _end) \
    do { \
        (tag).id = (_id); \
        (tag).start = (_start); \
        (tag).end = (_end); \
    } while (0);
// Defined with an empty expansion. No use is visible in this header; presumably a
// placeholder or leftover — confirm there are no users before removing it.
#define NEW_TAG_META
// Grows the vector `v` by one value-initialized element and yields a reference to that
// new last element, so it can be assigned or bound directly:
//     new_vector_item(tags).id = 3;
// Written with the standard comma operator instead of a GNU statement expression, so it
// no longer depends on a compiler extension, and `v` is parenthesized so expressions such
// as `*ptr` can be passed. `v` is evaluated more than once: no side effects in the argument.
#define new_vector_item(v) ((v).resize((v).size() + 1), (v).back())
// Allocation hooks: mem_alloc(size) and mem_realloc(ptr, size).
// The first branch is compiled out (`#if 0`); when enabled it pulls in the Perl XS headers
// and maps the hooks to saferealloc/safemalloc. The active branch maps them to the
// C library's realloc/malloc.
#if 0
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#define mem_realloc(p, s) saferealloc(p, s)
#define mem_alloc(s) safemalloc(s)
#else
#define mem_realloc(p, s) realloc(p, s)
#define mem_alloc(s) malloc(s)
#endif
// Table of domain names, each stored with its precomputed length (without the NUL).
// The consumer is not visible in this header; presumably links to these domains are
// treated as internal rather than external — TODO confirm.
static const struct {
    const char *domain;    // domain name literal
    unsigned char length;  // strlen(domain), filled in by const_str_len
} allowed_domains[] = {
    {const_str_len("spaces.ru")},
    {const_str_len("space.me")},
};
// Lowercase hexadecimal digit characters, indexed by value 0..15.
// Exactly 16 chars; the array is NOT NUL-terminated.
static const char hex_digits[] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
};
// Replacement kinds (R_*). Presumably the values stored in ReplaceTag::id — the code
// that assigns them is not visible in this header.
enum {
    R_NONE          = 0,
    R_SPACE         = 1,
    R_BREAK         = 2,
    R_HTML_WRAP     = 3,
    R_IMG_TAG       = 4,
    R_EXTERNAL_LINK = 5,
    R_INTERNAL_LINK = 6,
};
// A tag name held inline together with its length. No user of this type is visible in
// this header.
struct TagNameList {
// Inline name buffer of 11 bytes, i.e. room for at most 10 characters plus a NUL.
char name[11];
// Length of the name; presumably the character count without the NUL (TODO confirm).
unsigned char len;
};
// Per-tag behaviour bits, combined with `|` in the `flags` field of the html_tags rows.
// All eight bits of an unsigned char are in use. What each bit triggers is decided by
// the implementation, which is not visible in this header.
enum {
    TAG_ALLOW_SELF_CLOSURE = 0x01,
    TAG_IS_BLOCK           = 0x02,
    TAG_SAVE_ATTRS         = 0x04,
    TAG_IS_BREAK           = 0x08,
    TAG_REPLACE_INNERS     = 0x10,
    TAG_IS_ALLOWED         = 0x20,
    TAG_SKIP               = 0x40,
    TAG_ALT                = 0x80,
};
// One row of the tag descriptor table: tag name, its length and its TAG_* flag bits.
struct HtmlTags {
    const char *name;     // tag name literal (NULL in the placeholder row)
    unsigned char len;    // length of `name`
    unsigned char flags;  // bitwise OR of TAG_* flag values
};
// A pending replacement over a range of the text: kind plus start/end positions.
// These are the three fields assigned by the NEW_REPLACE macro.
struct ReplaceTag {
// Kind of replacement; presumably one of the R_* values (TODO confirm).
unsigned char id;
// Start position of the range; presumably an offset into the text buffer.
unsigned int start;
// End position of the range; whether it is inclusive is not visible here.
unsigned int end;
};
// Counted string buffer; this is the type returned by optimize_html().
struct t_str {
    char *value;             // pointer to the character data
    unsigned int length;     // number of bytes in use
    unsigned int allocated;  // presumably the capacity of `value` in bytes (TODO confirm)
};
// Sentinel tag id: the largest unsigned char value (-1 converted to unsigned char),
// i.e. no valid index into the tag table.
#define NULL_TAG_ID (static_cast<unsigned char>(-1))
// Tag ids. Each id is also the row index of that tag in the html_tags table, so the
// order here must stay identical to the order of the table rows.
enum {
    TAG_UNKNOWN = 0,
    TAG_S,
    TAG_U,
    TAG_B,
    TAG_I,
    TAG_IMG,
    TAG_A,
    TAG_BR,
    TAG_HEAD,
    TAG_STYLE,
    TAG_TITLE,
    TAG_SCRIPT,
    TAG_TEXTAREA,
    // Historical misspelling of "noscript"; kept because existing code may use it.
    TAG_NOSTRIPT,
    // Correctly spelled alias with the same value. The next enumerator continues from
    // this value, so no id changes.
    TAG_NOSCRIPT = TAG_NOSTRIPT,
    TAG_HTML,
    TAG_TD,
    TAG_ADDRESS,
    TAG_ARTICLE,
    TAG_ASIDE,
    TAG_AUDIO,
    TAG_BLOCKQUOTE,
    TAG_CANVAS,
    TAG_DD,
    TAG_DIV,
    TAG_DL,
    TAG_FIELDSET,
    TAG_FIGCAPTION,
    TAG_FIGURE,
    TAG_FOOTER,
    TAG_FORM,
    TAG_H1,
    TAG_H2,
    TAG_H3,
    TAG_H4,
    TAG_H5,
    TAG_H6,
    TAG_HEADER,
    TAG_HGROUP,
    TAG_HR,
    TAG_OL,
    TAG_OUTPUT,
    TAG_P,
    TAG_PRE,
    TAG_SECTION,
    TAG_TABLE,
    TAG_TFOOT,
    TAG_TBODY,
    TAG_THEAD,
    TAG_UL,
    TAG_VIDEO,
    TAG_LI,
    TAG_TR,
    TAG_XBODY,
    TAG_XHTML,
};
// Descriptor table for every known tag: {name, name length, TAG_* flag bits}.
// Rows are positional: row N describes the tag whose id in the tag id enum
// (TAG_UNKNOWN, TAG_S, ...) equals N, so the rows must stay in exactly that order.
static const HtmlTags html_tags[] = {
// Row 0 (TAG_UNKNOWN): placeholder without a name.
{NULL, 0, TAG_SKIP},
// s, u, b, i: flagged TAG_IS_ALLOWED only.
{const_str_len("s"), TAG_IS_ALLOWED},
{const_str_len("u"), TAG_IS_ALLOWED},
{const_str_len("b"), TAG_IS_ALLOWED},
{const_str_len("i"), TAG_IS_ALLOWED},
// img, a, br: tag-specific flag combinations.
{const_str_len("img"), TAG_ALT | TAG_ALLOW_SELF_CLOSURE},
{const_str_len("a"), TAG_SAVE_ATTRS | TAG_IS_ALLOWED},
{const_str_len("br"), TAG_IS_BREAK | TAG_ALLOW_SELF_CLOSURE},
// head, style, title, script, textarea: flagged TAG_REPLACE_INNERS.
{const_str_len("head"), TAG_REPLACE_INNERS},
{const_str_len("style"), TAG_REPLACE_INNERS},
{const_str_len("title"), TAG_REPLACE_INNERS},
{const_str_len("script"), TAG_REPLACE_INNERS},
{const_str_len("textarea"), TAG_REPLACE_INNERS},
// noscript: the matching enum id is spelled TAG_NOSTRIPT.
{const_str_len("noscript"), TAG_IS_BLOCK},
// html, td: flagged TAG_SKIP.
{const_str_len("html"), TAG_SKIP},
{const_str_len("td"), TAG_SKIP},
// From here to "tr": flagged TAG_IS_BLOCK.
{const_str_len("address"), TAG_IS_BLOCK},
{const_str_len("article"), TAG_IS_BLOCK},
{const_str_len("aside"), TAG_IS_BLOCK},
{const_str_len("audio"), TAG_IS_BLOCK},
{const_str_len("blockquote"), TAG_IS_BLOCK},
{const_str_len("canvas"), TAG_IS_BLOCK},
{const_str_len("dd"), TAG_IS_BLOCK},
{const_str_len("div"), TAG_IS_BLOCK},
{const_str_len("dl"), TAG_IS_BLOCK},
{const_str_len("fieldset"), TAG_IS_BLOCK},
{const_str_len("figcaption"), TAG_IS_BLOCK},
{const_str_len("figure"), TAG_IS_BLOCK},
{const_str_len("footer"), TAG_IS_BLOCK},
{const_str_len("form"), TAG_IS_BLOCK},
{const_str_len("h1"), TAG_IS_BLOCK},
{const_str_len("h2"), TAG_IS_BLOCK},
{const_str_len("h3"), TAG_IS_BLOCK},
{const_str_len("h4"), TAG_IS_BLOCK},
{const_str_len("h5"), TAG_IS_BLOCK},
{const_str_len("h6"), TAG_IS_BLOCK},
{const_str_len("header"), TAG_IS_BLOCK},
{const_str_len("hgroup"), TAG_IS_BLOCK},
{const_str_len("hr"), TAG_IS_BLOCK},
{const_str_len("ol"), TAG_IS_BLOCK},
{const_str_len("output"), TAG_IS_BLOCK},
{const_str_len("p"), TAG_IS_BLOCK},
{const_str_len("pre"), TAG_IS_BLOCK},
{const_str_len("section"), TAG_IS_BLOCK},
{const_str_len("table"), TAG_IS_BLOCK},
{const_str_len("tfoot"), TAG_IS_BLOCK},
{const_str_len("tbody"), TAG_IS_BLOCK},
{const_str_len("thead"), TAG_IS_BLOCK},
{const_str_len("ul"), TAG_IS_BLOCK},
{const_str_len("video"), TAG_IS_BLOCK},
{const_str_len("li"), TAG_IS_BLOCK},
{const_str_len("tr"), TAG_IS_BLOCK},
// xbody, xhtml: flagged TAG_SKIP.
{const_str_len("xbody"), TAG_SKIP},
{const_str_len("xhtml"), TAG_SKIP},
};
// Option bits (FLAGS_*). Presumably OR-ed together into the `flags` argument of
// optimize_html() — TODO confirm against the implementation.
enum {
    FLAGS_CLEAR_SPACES = 1 << 0,
    FLAGS_DEBUG        = 1 << 1,
};
// Token kinds (TOK_*). The tokenizer that produces them is not visible in this header.
enum {
    TOK_UNK      = 0,
    TOK_TAG      = 1,
    TOK_COMMENT  = 2,
    TOK_DOCTYPE  = 3,
    TOK_CDATA    = 4,
    TOK_COMMENT2 = 5,
};
// Entry point of the optimizer, declared with C linkage; the definition is not in this header.
//   text_val - input text buffer (non-const, so the implementation may modify it)
//   text_len - length of the input
//   flags    - defaults to 0; presumably a bitwise OR of the FLAGS_* values (TODO confirm)
//   dup      - defaults to false; presumably selects working on a copy of the input — confirm
// Returns a t_str (value / length / allocated). Who owns and frees `value` is not visible
// here — check the implementation before freeing it.
extern "C" t_str optimize_html(char *text_val, unsigned int text_len, unsigned int flags = 0, bool dup = false);
#endif