1 /********************************************************************
2 Copyright (c) 2003-8, WebThing Ltd
3 Author: Nick Kew <nick@webthing.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License Version 2,
7 as published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You can obtain a copy of the GNU General Public License Version 2
15 from http://www.gnu.org/licenses/old-licenses/gpl-2.0.html or
16 http://apache.webthing.com/COPYING.txt
18 *********************************************************************/
21 /********************************************************************
24 You are requested to register as a user, at
25 http://apache.webthing.com/registration.html
27 This entitles you to support from the developer.
28 I'm unlikely to reply to help/support requests from
29 non-registered users, unless you're paying and/or offering
30 constructive feedback such as bug reports or sensible
31 suggestions for further development.
33 It also makes a small contribution to the effort
34 that's gone into developing this work.
35 *********************************************************************/
44 You can #define GO_FASTER to disable informational logging.
45 This disables the ProxyHTMLLogVerbose option altogether.
47 Default is to leave it undefined, and enable verbose logging
48 as a configuration option. Binaries are supplied with verbose
56 #define VERBOSE(x) if (verbose) x
57 #define VERBOSEB(x) if (verbose) {x}
60 #define VERSION_STRING "proxy_html/3.0.1"
65 #include <libxml/HTMLparser.h>
68 #include <http_protocol.h>
69 #include <http_config.h>
71 #include <apr_strings.h>
73 #include <apr_xlate.h>
75 /* To support Apache 2.1/2.2, we need the ap_ forms of the
76 * regexp stuff, and they're now used in the code.
77 * To support 2.0 in the same compile, we #define the
78 * AP_ versions if necessary.
81 /* it's 2.0, so we #define the ap_ versions */
82 #define ap_regex_t regex_t
83 #define ap_regmatch_t regmatch_t
84 #define AP_REG_EXTENDED REG_EXTENDED
85 #define AP_REG_ICASE REG_ICASE
86 #define AP_REG_NOSUB REG_NOSUB
87 #define AP_REG_NEWLINE REG_NEWLINE
89 #define ap_register_output_filter_protocol(a,b,c,d,e) ap_register_output_filter(a,b,c,d)
94 module AP_MODULE_DECLARE_DATA proxy_html_module ;
100 #define M_ATSTART 0x10
103 #define M_NOTLAST 0x80
104 #define M_INTERPOLATE_TO 0x100
105 #define M_INTERPOLATE_FROM 0x200
119 typedef struct urlmap {
120 struct urlmap* next ;
122 unsigned int regflags ;
132 const char* doctype ;
137 apr_array_header_t* events;
138 apr_array_header_t* skipto;
139 xmlCharEncoding default_encoding;
140 const char* charset_out;
150 apr_xlate_t* convset;
156 proxy_html_conf* cfg ;
157 htmlParserCtxtPtr parser ;
158 apr_bucket_brigade* bb ;
164 const char* encoding;
170 #define NORM_MSSLASH 0x2
171 #define NORM_RESET 0x4
172 static htmlSAXHandler sax ;
174 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ;
176 static const char* const fpi_html =
177 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ;
178 static const char* const fpi_html_legacy =
179 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ;
180 static const char* const fpi_xhtml =
181 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
182 static const char* const fpi_xhtml_legacy =
183 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ;
184 static const char* const html_etag = ">" ;
185 static const char* const xhtml_etag = " />" ;
186 /*#define DEFAULT_DOCTYPE fpi_html */
187 static const char* const DEFAULT_DOCTYPE = "" ;
188 #define DEFAULT_ETAG html_etag
190 static void normalise(unsigned int flags, char* str) {
192 if ( flags & NORM_LC )
193 for ( p = str ; *p ; ++p )
197 if ( flags & NORM_MSSLASH )
198 for ( p = ap_strchr_c(str, '\\') ; p ; p = ap_strchr_c(p+1, '\\') )
202 static void consume_buffer(saxctxt* ctx, const char* inbuf,
203 int bytes, int flag) {
208 int verbose = ctx->cfg->verbose;
210 if (ctx->conv_in == NULL) {
211 /* just feed it to libxml2 */
212 htmlParseChunk(ctx->parser, inbuf, bytes, flag) ;
215 if (ctx->conv_in->bytes > 0) {
216 /* FIXME: make this a reusable buf? */
217 buf = apr_palloc(ctx->f->r->pool, ctx->conv_in->bytes + bytes);
218 memcpy(buf, ctx->conv_in->buf, ctx->conv_in->bytes);
219 memcpy(buf + ctx->conv_in->bytes, inbuf, bytes);
220 bytes += ctx->conv_in->bytes;
221 ctx->conv_in->bytes = 0;
228 apr_size_t outsz = 4096;
229 rv = apr_xlate_conv_buffer(ctx->conv_in->convset,
230 buf + (bytes - insz), &insz,
232 htmlParseChunk(ctx->parser, outbuf, 4096-outsz, flag) ;
236 case APR_EINCOMPLETE:
237 if (insz < 32) {/* save dangling byte(s) and return */
238 ctx->conv_in->bytes = insz;
239 ctx->conv_in->buf = (buf != inbuf) ? buf + (bytes-insz)
240 : apr_pmemdup(ctx->f->r->pool, buf + (bytes-insz), insz);
242 } else { /*OK, maybe 4096 wasn't big enough, and ended mid-char */
245 case APR_EINVAL: /* try skipping one bad byte */
246 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
247 "Skipping invalid byte in input stream!") ) ;
251 /* Erk! What's this? Bail out and eat the buf raw
252 * if libxml2 will accept it!
254 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, ctx->f->r,
255 "Failed to convert input; trying it raw") ;
256 htmlParseChunk(ctx->parser, buf + (bytes - insz), insz, flag) ;
257 ctx->conv_in = NULL; /* don't try converting any more */
262 static void AP_fwrite(saxctxt* ctx, const char* inbuf, int bytes, int flush) {
263 /* convert charset if necessary, and output */
268 int verbose = ctx->cfg->verbose;
271 if (ctx->conv_out == NULL) {
272 ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);
275 if (ctx->conv_out->bytes > 0) {
276 /* FIXME: make this a reusable buf? */
277 buf = apr_palloc(ctx->f->r->pool, ctx->conv_out->bytes + bytes);
278 memcpy(buf, ctx->conv_out->buf, ctx->conv_out->bytes);
279 memcpy(buf + ctx->conv_out->bytes, inbuf, bytes);
280 bytes += ctx->conv_out->bytes;
281 ctx->conv_out->bytes = 0;
288 apr_size_t outsz = 2048;
289 rv = apr_xlate_conv_buffer(ctx->conv_out->convset,
290 buf + (bytes - insz), &insz,
292 ap_fwrite(ctx->f->next, ctx->bb, outbuf, 2048-outsz) ;
296 case APR_EINCOMPLETE: /* save dangling byte(s) and return */
297 /* but if we need to flush, just abandon them */
298 if ( flush) { /* if we're flushing, this must be complete */
299 /* so this is an error */
300 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
301 "Skipping invalid byte in output stream!") ) ;
303 ctx->conv_out->bytes = insz;
304 ctx->conv_out->buf = (buf != inbuf) ? buf + (bytes-insz)
305 : apr_pmemdup(ctx->f->r->pool, buf + (bytes-insz), insz);
308 case APR_EINVAL: /* try skipping one bad byte */
309 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
310 "Skipping invalid byte in output stream!") ) ;
314 /* Erk! What's this? Bail out and pass the buf raw
315 * if libxml2 will accept it!
317 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, ctx->f->r,
318 "Failed to convert output; sending UTF-8") ) ;
319 ap_fwrite(ctx->f->next, ctx->bb, buf + (bytes - insz), insz) ;
325 /* This is always utf-8 on entry. We can convert charset within FLUSH */
326 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0) ; begin = i+1
327 static void pcharacters(void* ctxt, const xmlChar *uchars, int length) {
328 const char* chars = (const char*) uchars;
329 saxctxt* ctx = (saxctxt*) ctxt ;
332 for ( begin=i=0; i<length; i++ ) {
334 case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ;
335 case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ;
336 case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ;
337 case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ;
343 static void preserve(saxctxt* ctx, const size_t len) {
345 if ( len <= ( ctx->avail - ctx->offset ) )
347 else while ( len > ( ctx->avail - ctx->offset ) )
348 ctx->avail += ctx->cfg->bufsz ;
350 newbuf = realloc(ctx->buf, ctx->avail) ;
351 if ( newbuf != ctx->buf ) {
353 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (void*)free) ;
354 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
355 (void*)free, apr_pool_cleanup_null);
359 static void pappend(saxctxt* ctx, const char* buf, const size_t len) {
361 memcpy(ctx->buf+ctx->offset, buf, len) ;
364 static void dump_content(saxctxt* ctx) {
367 size_t s_from, s_to ;
371 ap_regmatch_t pmatch[10] ;
374 urlmap* themap = ctx->map;
376 int verbose = ctx->cfg->verbose ;
379 pappend(ctx, &c, 1) ; /* append null byte */
380 /* parse the text for URLs */
381 for ( m = themap ; m ; m = m->next ) {
382 if ( ! ( m->flags & M_CDATA ) )
384 if ( m->flags & M_REGEX ) {
387 while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) {
388 match = pmatch[0].rm_so ;
389 s_from = pmatch[0].rm_eo - match ;
390 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
392 s_to = strlen(subs) ;
393 len = strlen(ctx->buf) ;
396 const char* f = apr_pstrndup(ctx->f->r->pool,
397 ctx->buf + offs , s_from ) ;
398 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
399 "C/RX: match at %s, substituting %s", f, subs) ;
401 if ( s_to > s_from) {
402 preserve(ctx, s_to - s_from) ;
403 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
404 len + 1 - s_from - offs) ;
405 memcpy(ctx->buf+offs, subs, s_to) ;
407 memcpy(ctx->buf + offs, subs, s_to) ;
408 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
409 len + 1 - s_from - offs) ;
414 s_from = strlen(m->from.c) ;
415 s_to = strlen(m->to) ;
416 for ( found = strstr(ctx->buf, m->from.c) ; found ;
417 found = strstr(ctx->buf+match+s_to, m->from.c) ) {
418 match = found - ctx->buf ;
419 if ( ( m->flags & M_ATSTART ) && ( match != 0) )
421 len = strlen(ctx->buf) ;
422 if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) )
424 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
425 "C: matched %s, substituting %s", m->from.c, m->to) ) ;
426 if ( s_to > s_from ) {
427 preserve(ctx, s_to - s_from) ;
428 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
429 len + 1 - s_from - match) ;
430 memcpy(ctx->buf+match, m->to, s_to) ;
432 memcpy(ctx->buf+match, m->to, s_to) ;
433 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
434 len + 1 - s_from - match) ;
439 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1) ;
441 static void pcdata(void* ctxt, const xmlChar *uchars, int length) {
442 const char* chars = (const char*) uchars;
443 saxctxt* ctx = (saxctxt*) ctxt ;
444 if ( ctx->cfg->extfix ) {
445 pappend(ctx, chars, length) ;
447 /* not sure if this should force-flush
448 * (i.e. can one cdata section come in multiple calls?)
450 AP_fwrite(ctx, chars, length, 0) ;
453 static void pcomment(void* ctxt, const xmlChar *uchars) {
454 const char* chars = (const char*) uchars;
455 saxctxt* ctx = (saxctxt*) ctxt ;
456 if ( ctx->cfg->strip_comments )
459 if ( ctx->cfg->extfix ) {
460 pappend(ctx, "<!--", 4) ;
461 pappend(ctx, chars, strlen(chars) ) ;
462 pappend(ctx, "-->", 3) ;
464 ap_fputs(ctx->f->next, ctx->bb, "<!--") ;
465 AP_fwrite(ctx, chars, strlen(chars), 1) ;
466 ap_fputs(ctx->f->next, ctx->bb, "-->") ;
469 static void pendElement(void* ctxt, const xmlChar* uname) {
470 saxctxt* ctx = (saxctxt*) ctxt ;
471 const char* name = (const char*) uname;
472 const htmlElemDesc* desc = htmlTagLookup(uname);
474 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
476 if (!desc || desc->depr)
479 } else if ((ctx->cfg->doctype == fpi_html)
480 || (ctx->cfg->doctype == fpi_xhtml)) {
481 /* enforce html legacy */
485 /* TODO - implement HTML "allowed here" using the stack */
486 /* nah. Keeping the stack is too much overhead */
488 if ( ctx->offset > 0 ) {
490 ctx->offset = 0 ; /* having dumped it, we can re-use the memory */
492 if ( !desc || ! desc->empty ) {
493 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ;
496 static void pstartElement(void* ctxt, const xmlChar* uname,
497 const xmlChar** uattrs ) {
506 size_t s_to, s_from, match ;
508 saxctxt* ctx = (saxctxt*) ctxt ;
510 ap_regmatch_t pmatch[10] ;
512 int verbose = ctx->cfg->verbose ;
514 apr_array_header_t *linkattrs;
516 const char* name = (const char*) uname;
517 const char** attrs = (const char**) uattrs;
518 const htmlElemDesc* desc = htmlTagLookup(uname);
519 urlmap* themap = ctx->map;
524 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
527 if (!desc || desc->depr)
530 } else if ((ctx->cfg->doctype == fpi_html)
531 || (ctx->cfg->doctype == fpi_xhtml)) {
533 /* enforce html legacy */
538 if (!desc && enforce) {
539 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
540 "Bogus HTML element %s dropped", name) ;
543 if (desc && desc->depr && (enforce == 2) ) {
544 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
545 "Deprecated HTML element %s dropped", name) ;
549 descp = apr_array_push(ctx->stack);
551 /* TODO - implement HTML "allowed here" */
554 ap_fputc(ctx->f->next, ctx->bb, '<') ;
555 ap_fputs(ctx->f->next, ctx->bb, name) ;
558 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
559 for (a = desc->attrs_req; *a; a++)
563 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING) ;
564 for ( a = attrs ; *a ; a += 2 ) {
565 if (desc && enforce > 0) {
566 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
568 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
569 "Bogus HTML attribute %s of %s dropped", *a, name);
571 case HTML_DEPRECATED:
572 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
573 "Deprecated HTML attribute %s of %s dropped", *a, name);
576 required_attrs--; /* cross off the number still needed */
577 /* fallthrough - required implies valid */
584 pappend(ctx, a[1], strlen(a[1])+1) ;
585 is_uri = ATTR_IGNORE ;
587 tattr* attrs = (tattr*) linkattrs->elts;
588 for (i=0; i < linkattrs->nelts; ++i) {
589 if ( !strcmp(*a, attrs[i].val)) {
595 if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix
596 && (ctx->cfg->events != NULL) ) {
597 for (i=0; i < ctx->cfg->events->nelts; ++i) {
598 tattr* attrs = (tattr*) ctx->cfg->events->elts;
599 if ( !strcmp(*a, attrs[i].val)) {
600 is_uri = ATTR_EVENT ;
608 for ( m = themap ; m ; m = m->next ) {
609 if ( ! ( m->flags & M_HTML ) )
611 if ( m->flags & M_REGEX ) {
613 if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) {
615 offs = match = pmatch[0].rm_so ;
616 s_from = pmatch[0].rm_eo - match ;
617 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
620 const char* f = apr_pstrndup(ctx->f->r->pool,
621 ctx->buf + offs , s_from ) ;
622 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
623 "H/RX: match at %s, substituting %s", f, subs) ;
625 s_to = strlen(subs) ;
626 len = strlen(ctx->buf) ;
627 if ( s_to > s_from) {
628 preserve(ctx, s_to - s_from) ;
629 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
630 len + 1 - s_from - offs) ;
631 memcpy(ctx->buf+offs, subs, s_to) ;
633 memcpy(ctx->buf + offs, subs, s_to) ;
634 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
635 len + 1 - s_from - offs) ;
639 s_from = strlen(m->from.c) ;
640 if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) {
642 s_to = strlen(m->to) ;
643 len = strlen(ctx->buf) ;
644 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
645 "H: matched %s, substituting %s", m->from.c, m->to) ) ;
646 if ( s_to > s_from ) {
647 preserve(ctx, s_to - s_from) ;
648 memmove(ctx->buf+s_to, ctx->buf+s_from,
650 memcpy(ctx->buf, m->to, s_to) ;
651 } else { /* it fits in the existing space */
652 memcpy(ctx->buf, m->to, s_to) ;
653 memmove(ctx->buf+s_to, ctx->buf+s_from,
659 /* URIs only want one match unless overridden in the config */
660 if ( (num_match > 0) && !( m->flags & M_NOTLAST ) )
665 for ( m = themap ; m ; m = m->next ) {
666 num_match = 0 ; /* reset here since we're working per-rule */
667 if ( ! ( m->flags & M_EVENTS ) )
669 if ( m->flags & M_REGEX ) {
672 while ( ! ap_regexec(m->from.r, ctx->buf+offs,
673 nmatch, pmatch, 0) ) {
674 match = pmatch[0].rm_so ;
675 s_from = pmatch[0].rm_eo - match ;
676 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
679 const char* f = apr_pstrndup(ctx->f->r->pool,
680 ctx->buf + offs , s_from ) ;
681 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
682 "E/RX: match at %s, substituting %s", f, subs) ;
684 s_to = strlen(subs) ;
686 len = strlen(ctx->buf) ;
687 if ( s_to > s_from) {
688 preserve(ctx, s_to - s_from) ;
689 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
690 len + 1 - s_from - offs) ;
691 memcpy(ctx->buf+offs, subs, s_to) ;
693 memcpy(ctx->buf + offs, subs, s_to) ;
694 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
695 len + 1 - s_from - offs) ;
701 found = strstr(ctx->buf, m->from.c) ;
702 if ( (m->flags & M_ATSTART) && ( found != ctx->buf) )
705 s_from = strlen(m->from.c) ;
706 s_to = strlen(m->to) ;
707 match = found - ctx->buf ;
708 if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) {
709 found = strstr(ctx->buf+match+s_from, m->from.c) ;
712 found = strstr(ctx->buf+match+s_to, m->from.c) ;
714 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
715 "E: matched %s, substituting %s", m->from.c, m->to) ) ;
716 len = strlen(ctx->buf) ;
717 if ( s_to > s_from ) {
718 preserve(ctx, s_to - s_from) ;
719 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
720 len + 1 - s_from - match) ;
721 memcpy(ctx->buf+match, m->to, s_to) ;
723 memcpy(ctx->buf+match, m->to, s_to) ;
724 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
725 len + 1 - s_from - match) ;
730 if ( num_match && ( m->flags & M_LAST ) )
739 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ;
742 if ( ctx->cfg->flags != 0 )
743 normalise(ctx->cfg->flags, ctx->buf) ;
745 /* write the attribute, using pcharacters to html-escape
746 anything that needs it in the value.
748 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ;
749 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf)) ;
750 ap_fputc(ctx->f->next, ctx->bb, '"') ;
755 if ( desc && desc->empty )
756 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ;
758 ap_fputc(ctx->f->next, ctx->bb, '>') ;
760 if ((enforce > 0) && (required_attrs > 0)) {
761 /* if there are more required attributes than we found then complain */
762 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
763 "HTML element %s is missing %d required attributes",
764 name, required_attrs);
768 /* globals set once at startup */
769 static ap_regex_t* seek_meta_ctype ;
770 static ap_regex_t* seek_charset ;
771 static ap_regex_t* seek_meta ;
773 static xmlCharEncoding sniff_encoding(saxctxt* ctx, const char* cbuf,
776 int verbose = ctx->cfg->verbose;
778 request_rec* r = ctx->f->r ;
779 proxy_html_conf* cfg = ctx->cfg ;
780 xmlCharEncoding ret ;
782 ap_regmatch_t match[2] ;
783 char* buf = (char*)cbuf ;
784 apr_xlate_t* convset;
786 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
787 "Content-Type is %s", r->content_type) ) ;
789 /* If we've got it in the HTTP headers, there's nothing to do */
790 if ( r->content_type &&
791 ( p = ap_strcasestr(r->content_type, "charset=") , p > 0 ) ) {
793 if ( ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ) ,
795 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
796 "Got charset %s from HTTP headers", ctx->encoding) ) ;
797 if ( ret = xmlParseCharEncoding(ctx->encoding),
798 ((ret != XML_CHAR_ENCODING_ERROR )
799 && (ret != XML_CHAR_ENCODING_NONE))) {
805 /* to sniff, first we look for BOM */
806 if (ctx->encoding == NULL) {
807 if ( ret = xmlDetectCharEncoding((const xmlChar*)buf, bytes),
808 ret != XML_CHAR_ENCODING_NONE ) {
809 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
810 "Got charset from XML rules.") ) ;
814 /* If none of the above, look for a META-thingey */
815 if ( ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0 ) {
816 p = apr_pstrndup(r->pool, buf + match[0].rm_so,
817 match[0].rm_eo - match[0].rm_so) ;
818 if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 )
819 ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
820 match[1].rm_eo - match[1].rm_so) ;
824 /* either it's set to something we found or it's still the default */
825 if ( ctx->encoding ) {
826 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
827 "Got charset %s from HTML META", ctx->encoding) ) ;
828 if ( ret = xmlParseCharEncoding(ctx->encoding),
829 ((ret != XML_CHAR_ENCODING_ERROR )
830 && (ret != XML_CHAR_ENCODING_NONE))) {
833 /* Unsupported charset. Can we get (iconv) support through apr_xlate? */
834 /* Aaargh! libxml2 has undocumented <META-crap> support. So this fails
835 * if metafix is not active. Have to make it conditional.
838 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
839 "Charset %s not supported by libxml2; trying apr_xlate", ctx->encoding) ) ;
840 if (apr_xlate_open(&convset, "UTF-8", ctx->encoding, r->pool) == APR_SUCCESS) {
841 ctx->conv_in = apr_pcalloc(r->pool, sizeof(conv_t));
842 ctx->conv_in->convset = convset ;
843 return XML_CHAR_ENCODING_UTF8 ;
845 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
846 "Charset %s not supported. Consider aliasing it?", ctx->encoding) ;
849 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
850 "Charset %s not supported. Consider aliasing it or use metafix?",
856 /* Use configuration default as a last resort */
857 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
858 "No usable charset information; using configuration default") ;
859 return (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
860 ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ;
862 static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/
873 ap_regmatch_t pmatch[2] ;
876 while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) {
879 p = buf+offs+pmatch[1].rm_eo ;
880 while ( !isalpha(*++p) ) ;
881 for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ;
882 header = apr_pstrndup(r->pool, p, q-p) ;
883 if ( strncasecmp(header, "Content-", 8) ) {
884 /* find content=... string */
885 for ( p = ap_strstr((char*)buf+offs+pmatch[0].rm_so, "content") ; *p ; ) {
887 while ( *p && isspace(*p) )
891 while ( *p && isspace(*++p) ) ;
892 if ( ( *p == '\'' ) || ( *p == '"' ) ) {
894 for ( q = p ; *q != delim ; ++q ) ;
896 for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ;
898 content = apr_pstrndup(r->pool, p, q-p) ;
901 } else if ( !strncasecmp(header, "Content-Type", 12) ) {
902 ret = apr_palloc(r->pool, sizeof(meta) ) ;
903 ret->start = pmatch[0].rm_so ;
904 ret->end = pmatch[0].rm_eo ;
906 if ( header && content ) {
907 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
908 "Adding header [%s: %s] from HTML META", header, content) ) ;
909 apr_table_setn(r->headers_out, header, content) ;
911 offs += pmatch[0].rm_eo ;
916 static const char* interpolate_vars(request_rec* r, const char* str) {
922 const char* replacement;
926 if (start = ap_strstr_c(start, "${"), start == NULL)
929 if (end = ap_strchr_c(start+2, '}'), end == NULL)
932 delim = ap_strchr_c(start, '|');
933 before = apr_pstrndup(r->pool, str, start-str);
936 var = apr_pstrndup(r->pool, start+2, delim-start-2) ;
938 var = apr_pstrndup(r->pool, start+2, end-start-2) ;
940 replacement = apr_table_get(r->subprocess_env, var) ;
943 replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
946 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
947 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
948 "Interpolating %s => %s", var, replacement) ;
952 static void fixup_rules(saxctxt* ctx) {
957 request_rec* r = ctx->f->r;
960 for (p = ctx->cfg->map; p; p = p->next) {
962 if (p->cond != NULL) {
963 thisval = apr_table_get(r->subprocess_env, p->cond->env);
965 /* required to be "anything" */
967 has_cond = 1; /* satisfied */
969 has_cond = 0; /* unsatisfied */
971 if (thisval && !strcasecmp(p->cond->val, thisval)) {
972 has_cond = 1; /* satisfied */
974 has_cond = 0; /* unsatisfied */
977 if (((has_cond == 0) && (p->cond->rel ==1 ))
978 || ((has_cond == 1) && (p->cond->rel == -1))) {
979 continue; /* condition is unsatisfied */
983 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
985 if (newp->flags & M_INTERPOLATE_FROM) {
986 newp->from.c = interpolate_vars(r, newp->from.c);
987 if (!newp->from.c || !*newp->from.c)
988 continue; /* don't use empty from-pattern */
989 if (newp->flags & M_REGEX) {
990 newp->from.r = ap_pregcomp(r->pool, newp->from.c, newp->regflags) ;
993 if (newp->flags & M_INTERPOLATE_TO) {
994 newp->to = interpolate_vars(r, newp->to);
996 /* evaluate p->cond; continue if unsatisfied */
997 /* create new urlmap with memcpy and append to map */
998 /* interpolate from if flagged to do so */
999 /* interpolate to if flagged to do so */
1011 static saxctxt* check_filter_init (ap_filter_t* f) {
1014 proxy_html_conf* cfg
1015 = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
1016 const char* force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
1018 const char* errmsg = NULL ;
1020 if ( ! f->r->proxyreq ) {
1021 errmsg = "Non-proxy request; not inserting proxy-html filter" ;
1022 } else if ( ! f->r->content_type ) {
1023 errmsg = "No content-type; bailing out of proxy-html filter" ;
1024 } else if ( strncasecmp(f->r->content_type, "text/html", 9) &&
1025 strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) {
1026 errmsg = "Non-HTML content; not inserting proxy-html filter" ;
1030 errmsg = "No links configured: nothing for proxy-html filter to do";
1035 if ( cfg->verbose ) {
1036 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, errmsg) ;
1039 ap_remove_output_filter(f) ;
1043 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ;
1045 fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ;
1047 apr_table_unset(f->r->headers_out, "Content-Length") ;
1052 fctx->map = cfg->map;
1053 /* defer dealing with charset_out until after sniffing charset_in
1054 * so we can support setting one to t'other.
1059 static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) {
1060 apr_xlate_t* convset;
1061 const char* charset = NULL;
1064 xmlCharEncoding enc ;
1065 const char* buf = 0 ;
1066 apr_size_t bytes = 0 ;
1067 #ifndef USE_OLD_LIBXML2
1068 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
1069 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ;
1072 saxctxt* ctxt = check_filter_init(f) ;
1077 return ap_pass_brigade(f->next, bb) ;
1079 verbose = ctxt->cfg->verbose;
1082 for ( b = APR_BRIGADE_FIRST(bb) ;
1083 b != APR_BRIGADE_SENTINEL(bb) ;
1084 b = APR_BUCKET_NEXT(b) ) {
1085 if ( APR_BUCKET_IS_METADATA(b) ) {
1086 if ( APR_BUCKET_IS_EOS(b) ) {
1087 if ( ctxt->parser != NULL ) {
1088 consume_buffer(ctxt, buf, 0, 1);
1090 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
1091 apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ;
1092 ap_pass_brigade(ctxt->f->next, ctxt->bb) ;
1093 } else if ( APR_BUCKET_IS_FLUSH(b) ) {
1094 /* pass on flush, except at start where it would cause
1095 * headers to be sent before doc sniffing
1097 if ( ctxt->parser != NULL ) {
1098 ap_fflush(ctxt->f->next, ctxt->bb) ;
1101 } else if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
1103 if ( ctxt->parser == NULL ) {
1104 if ( buf[bytes] != 0 ) {
1105 /* make a string for parse routines to play with */
1106 char* buf1 = apr_palloc(f->r->pool, bytes+1) ;
1107 memcpy(buf1, buf, bytes) ;
1111 /* For publishing systems that insert crap at the head of a
1112 * page that buggers up the parser. Search to first instance
1113 * of some relatively sane, or at least parseable, element.
1115 if (ctxt->cfg->skipto != NULL) {
1116 char* p = ap_strchr_c(buf, '<');
1117 tattr* starts = (tattr*) ctxt->cfg->skipto->elts;
1119 while (!found && *p) {
1121 for (i = 0; i < ctxt->cfg->skipto->nelts; ++i) {
1122 if ( !strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
1127 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1128 "Skipped to first <%s> element", starts[i].val)
1133 p = ap_strchr_c(p+1, '<');
1136 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1137 "Failed to find start of recognised HTML!") ;
1141 enc = sniff_encoding(ctxt, buf, bytes) ;
1142 /* now we have input charset, set output charset too */
1143 if (ctxt->cfg->charset_out) {
1144 if (!strcmp(ctxt->cfg->charset_out, "*"))
1145 charset = ctxt->encoding;
1147 charset = ctxt->cfg->charset_out;
1148 if (strcasecmp(charset, "utf-8")) {
1149 if (apr_xlate_open(&convset, charset, "UTF-8",
1150 f->r->pool) == APR_SUCCESS) {
1151 ctxt->conv_out = apr_pcalloc(f->r->pool, sizeof(conv_t));
1152 ctxt->conv_out->convset = convset;
1154 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1155 "Output charset %s not supported. Falling back to UTF-8",
1160 if (ctxt->conv_out) {
1161 const char* ctype = apr_psprintf(f->r->pool,
1162 "text/html;charset=%s", charset);
1163 ap_set_content_type(f->r, ctype) ;
1165 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
1167 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype) ;
1168 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf, 4, 0, enc) ;
1171 if (ctxt->parser == NULL) {
1172 apr_status_t rv = ap_pass_brigade(f->next, bb) ;
1173 ap_remove_output_filter(f) ;
1176 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
1177 (void*)htmlFreeParserCtxt, apr_pool_cleanup_null) ;
1178 #ifndef USE_OLD_LIBXML2
1179 if ( xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts ), xmlopts )
1180 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1181 "Unsupported parser opts %x", xmlopts) ;
1183 if ( ctxt->cfg->metafix )
1185 m = metafix(f->r, buf, ctxt->cfg->verbose) ;
1187 m = metafix(f->r, buf) ;
1190 consume_buffer(ctxt, buf, m->start, 0) ;
1191 consume_buffer(ctxt, buf+m->end, bytes-m->end, 0) ;
1193 consume_buffer(ctxt, buf, bytes, 0) ;
1196 consume_buffer(ctxt, buf, bytes, 0) ;
1199 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ;
1202 /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */
1203 apr_brigade_cleanup(bb) ;
1204 return APR_SUCCESS ;
/* Create a per-directory configuration record.
 * The record is allocated zero-filled from pool with apr_pcalloc, then
 * doctype, etag and default_encoding are given their defaults.  The
 * second argument x is not used in the lines shown here.
 */
1207 static void* proxy_html_config(apr_pool_t* pool, char* x) {
1208 proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ;
1209 ret->doctype = DEFAULT_DOCTYPE ;
1210 ret->etag = DEFAULT_ETAG ;
1212 ret->default_encoding = XML_CHAR_ENCODING_NONE ;
1213 /* ret->interp = 1; */
1214 /* don't initialise links and events until they get set/used */
/* Combine two proxy_html_conf records into a newly allocated one.
 *
 * pool - APR pool for the merged record and for copied urlmap entries.
 * BASE - the less specific configuration (cast to proxy_html_conf*).
 * ADD  - the more specific configuration (cast to proxy_html_conf*).
 *
 * The result is allocated with apr_palloc, which does not zero memory, so
 * each field has to be assigned explicitly.
 *
 * NOTE(review): several lines of this function are missing from the
 * listing (declarations, else keywords, closing braces, the return); the
 * comments below describe the visible statements and mark inferences. */
1217 static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) {
1218 proxy_html_conf* base = (proxy_html_conf*) BASE ;
1219 proxy_html_conf* add = (proxy_html_conf*) ADD ;
1220 proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ;
1222 /* don't merge declarations - just use the most specific */
1223 conf->links = (add->links == NULL) ? base->links : add->links;
1224 conf->events = (add->events == NULL) ? base->events : add->events;
/* Encoding and output charset: the value from add wins unless it is still
 * the unset marker (XML_CHAR_ENCODING_NONE / NULL). */
1226 conf->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE)
1227 ? base->default_encoding : add->default_encoding ;
1228 conf->charset_out = (add->charset_out == NULL)
1229 ? base->charset_out : add->charset_out ;
/* Both levels define URL maps: copy every entry of base, then every entry
 * of add, into a fresh list. Each copy is pushed onto the HEAD of
 * conf->map, so the resulting list holds the entries in the reverse of the
 * order they were visited.
 * NOTE(review): presumably conf->map is set to NULL on a line not shown
 * before the first loop - confirm, since conf is not zero-initialised. */
1231 if ( add->map && base->map ) {
1234 for ( a = base->map ; a ; a = a->next ) {
1235 urlmap* save = conf->map ;
1236 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
1237 conf->map->next = save ;
1239 for ( a = add->map ; a ; a = a->next ) {
1240 urlmap* save = conf->map ;
1241 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
1242 conf->map->next = save ;
/* Presumably the else branch: at most one side has a map, so share that
 * list (or NULL) without copying. */
1245 conf->map = add->map ? add->map : base->map ;
/* doctype and etag are compared with == against the default, i.e. by
 * pointer identity rather than by string content. */
1247 conf->doctype = ( add->doctype == DEFAULT_DOCTYPE )
1248 ? base->doctype : add->doctype ;
1249 conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ;
/* bufsz is taken from add unconditionally; base->bufsz is never read. */
1250 conf->bufsz = add->bufsz ;
/* NORM_RESET in add: ignore the inherited settings and use the values from
 * add only. The XOR clears the NORM_RESET bit, which is known to be set
 * on this path. */
1251 if ( add->flags & NORM_RESET ) {
1252 conf->flags = add->flags ^ NORM_RESET ;
1253 conf->metafix = add->metafix ;
1254 conf->extfix = add->extfix ;
1255 conf->interp = add->interp ;
1256 conf->strip_comments = add->strip_comments ;
1257 conf->skipto = add->skipto ;
1259 conf->verbose = add->verbose ;
/* Presumably the else branch: settings accumulate, so an option enabled at
 * either level stays enabled (bitwise OR); skipto prefers add when set. */
1262 conf->flags = base->flags | add->flags ;
1263 conf->metafix = base->metafix | add->metafix ;
1264 conf->extfix = base->extfix | add->extfix ;
1265 conf->interp = base->interp | add->interp ;
1266 conf->strip_comments = base->strip_comments | add->strip_comments ;
1267 conf->skipto = add->skipto ? add->skipto : base->skipto ;
1269 conf->verbose = base->verbose | add->verbose ;
/* Flag helpers for parsing a string of single-character options.
 *   REGFLAG(n,s,c)  - n if s is non-NULL and contains character c, else 0.
 *   XREGFLAG(n,s,c) - the inverse: n if s is NULL or lacks c, else 0.
 * NOTE(review): s is not parenthesised in the NULL test and is evaluated
 * more than once, so pass only a plain variable for it. */
1274 #define REGFLAG(n,s,c) ( (s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0 )
1275 #define XREGFLAG(n,s,c) ( (!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0 )
/* Fill in a urlmap entry from the textual pieces of a URL-map rule.
 *
 * pool   - APR pool used for the compiled regex and the condition record.
 * newmap - entry to populate.
 * from   - pattern to match (plain string or regex source).
 * to     - replacement text (its use is on lines not shown here).
 * flags  - string of single-character options, may be NULL (the REGFLAG
 *          and XREGFLAG macros both tolerate NULL).
 * cond   - optional condition; a leading `!` negates it.
 *
 * NOTE(review): a number of lines are missing from this listing (local
 * declarations, assignment targets, else keywords, closing braces), so the
 * branch structure noted below is inferred from the visible statements. */
1276 static void comp_urlmap(apr_pool_t* pool, urlmap* newmap,
1277 const char* from, const char* to, const char* flags, const char* cond) {
/* Option bits (presumably assigned to newmap->flags, which is tested just
 * below). M_HTML, M_EVENTS and M_CDATA use XREGFLAG, so they are ON unless
 * h, e or c appears in flags; the remaining bits are ON only when their
 * character appears. */
1280 = XREGFLAG(M_HTML,flags,'h')
1281 | XREGFLAG(M_EVENTS,flags,'e')
1282 | XREGFLAG(M_CDATA,flags,'c')
1283 | REGFLAG(M_ATSTART,flags,'^')
1284 | REGFLAG(M_ATEND,flags,'$')
1285 | REGFLAG(M_REGEX,flags,'R')
1286 | REGFLAG(M_LAST,flags,'L')
1287 | REGFLAG(M_NOTLAST,flags,'l')
1288 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1289 | REGFLAG(M_INTERPOLATE_FROM,flags,'v')
/* Keep the pattern as a plain string when it is not a regex, or when it
 * must be interpolated first (M_INTERPOLATE_FROM) and so cannot be
 * compiled here. */
1291 if ( ( newmap->flags & M_INTERPOLATE_FROM)
1292 || ! (newmap->flags & M_REGEX) ) {
1293 newmap->from.c = from ;
/* Otherwise (presumably the else branch) translate the x, i, n and s
 * option characters into AP_REG_* compile flags and compile the pattern. */
1297 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1298 | REGFLAG(AP_REG_ICASE,flags,'i')
1299 | REGFLAG(AP_REG_NOSUB,flags,'n')
1300 | REGFLAG(AP_REG_NEWLINE,flags,'s')
1302 newmap->from.r = ap_pregcomp(pool, from, newmap->regflags) ;
/* Condition handling; cond is dereferenced below, so this part presumably
 * sits behind a NULL check on a line not shown. The record is zero-filled
 * by apr_pcalloc, so val stays NULL unless set further down. */
1306 newmap->cond = apr_pcalloc(pool, sizeof(rewritecond));
/* rel is -1 for a negated condition and 1 otherwise; env points at the
 * text after the optional `!`. */
1307 if (cond[0] == '!') {
1308 newmap->cond->rel = -1;
1309 newmap->cond->env = cond+1;
1311 newmap->cond->rel = 1;
1312 newmap->cond->env = cond;
/* Look for `=` starting one character past the original cond; when found
 * anywhere but at that starting position, val points just past it.
 * NOTE(review): if this runs after both branches, as the layout suggests,
 * then for a non-negated condition with a one-character name (for example
 * `a=b`) eq equals cond and no value is recorded - confirm intended. */
1314 eq = ap_strchr_c(++cond, '=');
1315 if (eq && (eq != cond)) {
1317 newmap->cond->val = eq+1;
/* Presumably the no-condition case. */
1320 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (raw-argument form).
 *
 * cmd  - directive context; cmd->pool supplies all allocations.
 * CFG  - the proxy_html_conf being configured.
 * args - unparsed remainder of the directive line, consumed word by word
 *        with ap_getword_conf in the order from, to, flags, cond.
 *
 * NOTE(review): local declarations, the error returns, the empty-list
 * branch and the final return are on lines missing from this listing. */
1323 static const char* set_urlmap(cmd_parms* cmd, void* CFG, const char* args) {
1324 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
1326 apr_pool_t* pool = cmd->pool;
/* Usage text, presumably returned when a mandatory argument is absent. */
1329 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1333 const char* cond = NULL;
/* Comma operator: assign the next word, then test it for NULL. */
1335 if (from = ap_getword_conf(cmd->pool, &args), !from)
1337 if (to = ap_getword_conf(cmd->pool, &args), !to)
/* flags is optional; cond is only read when a non-empty flags word was
 * present, so a condition cannot be given without flags. */
1339 flags = ap_getword_conf(cmd->pool, &args);
1340 if (flags && *flags)
1341 cond = ap_getword_conf(cmd->pool, &args);
1345 /* the args look OK, so let's use them */
/* apr_palloc does not zero the entry; next is set here and the remaining
 * fields are left to comp_urlmap. */
1346 newmap = apr_palloc(pool, sizeof(urlmap) ) ;
1347 newmap->next = NULL;
/* Walk to the last entry and append, keeping rules in declaration order.
 * The loop dereferences cfg->map, so it presumably runs only when the
 * list is non-empty (guard not shown). */
1349 for ( map = cfg->map ; map->next ; map = map->next ) ;
1350 map->next = newmap ;
1354 comp_urlmap(cmd->pool, newmap, from, to, flags, cond);
/* Handler for the ProxyHTMLDoctype directive.
 *
 * t - first argument: `xhtml`, `html` (both matched case-insensitively),
 *     or any other text, which is stored as the doctype itself.
 * l - optional second argument (declared on a line not shown); may be
 *     NULL, as every use below checks it first.
 *
 * Sets cfg->doctype and the matching cfg->etag.
 * NOTE(review): else keywords, closing braces and the return statement are
 * missing from this listing. */
1358 static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t,
1360 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
1361 if ( !strcasecmp(t, "xhtml") ) {
1362 cfg->etag = xhtml_etag ;
/* A second argument of `legacy` selects the legacy doctype variant. */
1363 if ( l && !strcasecmp(l, "legacy") )
1364 cfg->doctype = fpi_xhtml_legacy ;
1366 cfg->doctype = fpi_xhtml ;
1367 } else if ( !strcasecmp(t, "html") ) {
1368 cfg->etag = html_etag ;
1369 if ( l && !strcasecmp(l, "legacy") )
1370 cfg->doctype = fpi_html_legacy ;
1372 cfg->doctype = fpi_html ;
/* Anything else: keep a pool-owned copy of the text as the doctype; a
 * second argument starting with x or X selects the XHTML etag, otherwise
 * (presumably the else branch) the HTML one. */
1374 cfg->doctype = apr_pstrdup(cmd->pool, t) ;
1375 if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) )
1376 cfg->etag = xhtml_etag ;
1378 cfg->etag = html_etag ;
/* Handler for the ProxyHTMLFixups directive; called once per argument.
 *
 * arg - one option word, compared case-sensitively with strcmp:
 *         lowercase -> NORM_LC
 *         dospath   -> NORM_MSSLASH
 *         reset     -> NORM_RESET
 *       The matching bit is ORed into cfg->flags. A NULL or empty arg
 *       sets nothing.
 *
 * NOTE(review): the handling of unrecognised words and the return
 * statement are on lines missing from this listing. */
1382 static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg) {
1383 proxy_html_conf* cfg = CFG;
1384 if ( arg && *arg ) {
1385 if ( !strcmp(arg, "lowercase") )
1386 cfg->flags |= NORM_LC ;
1387 else if ( !strcmp(arg, "dospath") )
1388 cfg->flags |= NORM_MSSLASH ;
1389 else if ( !strcmp(arg, "reset") )
1390 cfg->flags |= NORM_RESET ;
/* Handler for the ProxyHTMLEvents directive; called once per argument.
 *
 * Creates cfg->events on first use (APR array of tattr, initial capacity
 * 20, allocated from cmd->pool) and pushes one new element for this arg.
 *
 * NOTE(review): the declaration of attr, the statement that stores arg in
 * the new element and the return are on lines missing from this listing. */
1394 static const char* set_events(cmd_parms* cmd, void* CFG, const char* arg) {
1396 proxy_html_conf* cfg = CFG;
1397 if (cfg->events == NULL)
1398 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1399 attr = apr_array_push(cfg->events) ;
/* Handler for the ProxyHTMLStartParse directive; called once per argument.
 *
 * Creates cfg->skipto on first use (APR array of tattr, initial capacity
 * 4, allocated from cmd->pool) and pushes one new element for this arg.
 *
 * NOTE(review): the declaration of attr, the statement that stores arg in
 * the new element and the return are on lines missing from this listing. */
1403 static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg) {
1405 proxy_html_conf* cfg = CFG;
1406 if (cfg->skipto == NULL)
1407 cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
1408 attr = apr_array_push(cfg->skipto) ;
/* Handler for the ProxyHTMLLinks directive; called once per
 * (element, attribute) pair.
 *
 * elt - element name, used as the key into the cfg->links hash.
 * att - attribute name to record for that element.
 *
 * cfg->links is created on first use and maps an element name to an APR
 * array; one new array element is pushed per call.
 *
 * NOTE(review): the declaration of attr, the test guarding the array
 * creation, the statement that stores att and the return are on lines
 * missing from this listing. */
1412 static const char* set_links(cmd_parms* cmd, void* CFG,
1413 const char* elt, const char* att) {
1414 apr_array_header_t* attrs;
1416 proxy_html_conf* cfg = CFG;
1418 if (cfg->links == NULL)
1419 cfg->links = apr_hash_make(cmd->pool);
1421 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING) ;
/* Presumably reached only when no array exists yet for this element.
 * NOTE(review): the element size here is sizeof(tattr*), whereas
 * set_events and set_skipto use sizeof(tattr) - confirm the two sizes
 * agree for the tattr type. */
1423 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*)) ;
1424 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs) ;
1426 attr = apr_array_push(attrs) ;
/* Handler for the ProxyHTMLCharsetAlias directive; called once per
 * (charset, alias) pair.
 *
 * The directive is checked against the GLOBAL_ONLY context, then the
 * alias is registered with libxml2 through xmlAddEncodingAlias, which is
 * a process-wide registration rather than a field of CFG.
 *
 * Returns the error text below when neither earlier branch returned.
 * NOTE(review): the bodies of the two earlier branches are on lines
 * missing from this listing; presumably they return the context error and
 * NULL (xmlAddEncodingAlias yielding 0) respectively. */
1430 static const char* set_charset_alias(cmd_parms* cmd, void* CFG,
1431 const char* charset, const char* alias) {
1432 const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY);
1435 else if (xmlAddEncodingAlias(charset, alias) == 0)
1438 return "Error setting charset alias";
/* Handler for the ProxyHTMLCharsetDefault directive.
 *
 * charset - encoding name, resolved with xmlParseCharEncoding.
 *
 * Returns an error message when the name resolves to
 * XML_CHAR_ENCODING_NONE or XML_CHAR_ENCODING_ERROR. Note that the
 * resolved value is written to cfg->default_encoding before it is
 * checked, so the field is updated even on the error paths.
 *
 * NOTE(review): the remaining switch cases and the success return are on
 * lines missing from this listing. */
1440 static const char* set_charset_default(cmd_parms* cmd, void* CFG,
1441 const char* charset) {
1442 proxy_html_conf* cfg = CFG;
1443 cfg->default_encoding = xmlParseCharEncoding(charset);
1444 switch(cfg->default_encoding) {
1445 case XML_CHAR_ENCODING_NONE:
1446 return "Default charset not found";
1447 case XML_CHAR_ENCODING_ERROR:
1448 return "Invalid or unsupported default charset";
/* Configuration directives provided by this module.
 *
 * Each entry names a directive, its handler, optional handler data, the
 * contexts where it is allowed and a help string. Directives handled by
 * ap_set_flag_slot, ap_set_int_slot or ap_set_string_slot pass the offset
 * of the target field inside proxy_html_conf as handler data.
 * All directives are allowed in RSRC_CONF|ACCESS_CONF except
 * ProxyHTMLCharsetAlias, which is RSRC_CONF only (its handler also
 * requires the GLOBAL_ONLY context).
 *
 * NOTE(review): some lines of the table (including its terminator) are
 * missing from this listing. */
1453 static const command_rec proxy_html_cmds[] = {
1454 AP_INIT_ITERATE("ProxyHTMLStartParse", set_skipto, NULL,
1455 RSRC_CONF|ACCESS_CONF,
1456 "Ignore anything in front of the first of these elements"),
1457 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1458 RSRC_CONF|ACCESS_CONF, "Strings to be treated as scripting events"),
1459 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1460 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
/* Raw arguments: set_urlmap splits the line itself. */
1461 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1462 RSRC_CONF|ACCESS_CONF, "Map URL From To" ) ,
1463 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1464 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) ,
/* NOTE(review): set_flags also accepts the word reset, which this help
 * text does not mention. */
1465 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1466 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) ,
1467 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1468 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1469 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) ,
1470 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1471 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1472 RSRC_CONF|ACCESS_CONF,
1473 "Support interpolation and conditions in URLMaps" ) ,
1474 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1475 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1476 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) ,
1477 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1478 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1479 RSRC_CONF|ACCESS_CONF, "Strip out comments" ) ,
1481 AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot,
1482 (void*)APR_OFFSETOF(proxy_html_conf, verbose),
1483 RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) ,
1485 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1486 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1487 RSRC_CONF|ACCESS_CONF, "Buffer size" ) ,
1488 AP_INIT_ITERATE2("ProxyHTMLCharsetAlias", set_charset_alias, NULL,
1489 RSRC_CONF, "ProxyHTMLCharsetAlias charset alias [more aliases]" ) ,
1490 AP_INIT_TAKE1("ProxyHTMLCharsetDefault", set_charset_default, NULL,
1491 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetDefault charset" ) ,
1492 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1493 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1494 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset" ) ,
/* Post-config hook (registered in proxy_html_hooks): one-time setup of the
 * state shared by the filter.
 *
 * p - pool used for the version component and the compiled regexes.
 * p1, p2 - not referenced by any statement visible here.
 *
 * Steps: advertise VERSION_STRING, compile the three patterns used to
 * find META elements and charset declarations, and initialise the SAX
 * handler table with this module's callbacks.
 *
 * NOTE(review): the end of the parameter list and the return statement
 * are on lines missing from this listing. */
1497 static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2,
1499 ap_add_version_component(p, VERSION_STRING) ;
/* A META element whose http-equiv is content-type; the whole element is
 * captured as group 1. Case-insensitive. */
1500 seek_meta_ctype = ap_pregcomp(p,
1501 "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
1502 AP_REG_EXTENDED|AP_REG_ICASE) ;
/* charset=NAME, with NAME captured as group 1. */
1503 seek_charset = ap_pregcomp(p, "charset=([A-Za-z0-9_-]+)",
1504 AP_REG_EXTENDED|AP_REG_ICASE) ;
/* Any META element carrying an http-equiv attribute. */
1505 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1506 AP_REG_EXTENDED|AP_REG_ICASE) ;
/* Zero the handler table first so every callback not assigned below is
 * NULL. */
1507 memset(&sax, 0, sizeof(htmlSAXHandler));
1508 sax.startElement = pstartElement ;
1509 sax.endElement = pendElement ;
1510 sax.characters = pcharacters ;
1511 sax.comment = pcomment ;
1512 sax.cdataBlock = pcdata ;
/* Register this module's hooks with the server.
 *
 * Registers proxy_html_filter as the output filter named proxy-html
 * (type AP_FTYPE_RESOURCE, declared as changing the content and its
 * length via the AP_FILTER_PROTO_* flags) and schedules mod_proxy_html to
 * run at post-config time with APR_HOOK_MIDDLE ordering.
 *
 * p - not referenced by any statement visible here.
 * NOTE(review): the closing brace is on a line missing from this
 * listing. */
1515 static void proxy_html_hooks(apr_pool_t* p) {
1516 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1517 NULL, AP_FTYPE_RESOURCE,
1518 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH) ;
1519 ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ;
1521 module AP_MODULE_DECLARE_DATA proxy_html_module = {
1522 STANDARD20_MODULE_STUFF,