1 /********************************************************************
2 Copyright (c) 2003-7, WebThing Ltd
3 Author: Nick Kew <nick@webthing.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License Version 2,
7 as published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You can obtain a copy of the GNU General Public License Version 2
15 from http://www.gnu.org/licenses/old-licenses/gpl-2.0.html or
16 http://apache.webthing.com/COPYING.txt
18 *********************************************************************/
21 /********************************************************************
24 You are requested to register as a user, at
25 http://apache.webthing.com/registration.html
27 This entitles you to support from the developer.
28 I'm unlikely to reply to help/support requests from
29 non-registered users, unless you're paying and/or offering
30 constructive feedback such as bug reports or sensible
31 suggestions for further development.
33 It also makes a small contribution to the effort
34 that's gone into developing this work.
35 *********************************************************************/
44 You can #define GO_FASTER to disable informational logging.
45 This disables the ProxyHTMLLogVerbose option altogether.
47 Default is to leave it undefined, and enable verbose logging
48 as a configuration option. Binaries are supplied with verbose
/* VERBOSE(x): execute statement x only when a variable named `verbose`
 * is non-zero in the enclosing scope. It expands to a bare `if`, so it
 * must not be placed directly before an `else`.
 */
55 #define VERBOSE(x) if ( verbose ) x
/* Name/version identifier string of this module. */
58 #define VERSION_STRING "proxy_html/3.0.0"
63 #include <libxml/HTMLparser.h>
66 #include <http_protocol.h>
67 #include <http_config.h>
69 #include <apr_strings.h>
71 #include <apr_xlate.h>
73 /* To support Apache 2.1/2.2, we need the ap_ forms of the
74 * regexp stuff, and they're now used in the code.
75 * To support 2.0 in the same compile, we #define the
76 * AP_ versions if necessary.
79 /* it's 2.0, so we #define the ap_ versions */
/* Apache 2.0 compatibility shims: map the ap_/AP_ regex names used
 * throughout this file onto the un-prefixed regex names.
 */
80 #define ap_regex_t regex_t
81 #define ap_regmatch_t regmatch_t
82 #define AP_REG_EXTENDED REG_EXTENDED
83 #define AP_REG_ICASE REG_ICASE
84 #define AP_REG_NOSUB REG_NOSUB
85 #define AP_REG_NEWLINE REG_NEWLINE
/* The five-argument registration call is mapped onto the four-argument
 * ap_register_output_filter; the fifth argument is discarded.
 */
87 #define ap_register_output_filter_protocol(a,b,c,d,e) ap_register_output_filter(a,b,c,d)
/* Module record; used as the key for ap_get_module_config() lookups. */
92 module AP_MODULE_DECLARE_DATA proxy_html_module ;
/* Bits of a rule's flags field. The letters are the rule-flag characters
 * that comp_urlmap() maps onto each bit:
 *   M_ATSTART          '^'  rule is tied to the start of the text
 *   M_NOTLAST          'l'  keep applying further rules after a URI match
 *   M_INTERPOLATE_TO   'V'  expand ${var} references in the to-pattern
 *   M_INTERPOLATE_FROM 'v'  expand ${var} references in the from-pattern
 */
98 #define M_ATSTART 0x10
101 #define M_NOTLAST 0x80
102 #define M_INTERPOLATE_TO 0x100
103 #define M_INTERPOLATE_FROM 0x200
// NOTE(review): only some members of the structures below are visible here;
// the notes describe how the visible members are used elsewhere in this file.
// urlmap: one rewrite rule; rules form a singly linked list.
117 typedef struct urlmap {
// next rule in the list (NULL terminates)
118 struct urlmap* next ;
// flags handed to ap_pregcomp when the from-pattern is a regex
120 unsigned int regflags ;
// doctype string written at the start of the output; also compared by
// pointer against the fpi_* constants to select enforcement
130 const char* doctype ;
// attribute names treated as scripting events (array of tattr)
135 apr_array_header_t* events;
// element names the filter may skip forward to at the start of a document
136 apr_array_header_t* skipto;
// encoding used when none can be detected
137 xmlCharEncoding default_encoding;
// output charset; the value "*" means "same as the input charset"
138 const char* charset_out;
// apr_xlate conversion handle
148 apr_xlate_t* convset;
// per-directory configuration in effect for this request
154 proxy_html_conf* cfg ;
// libxml2 push parser; NULL until the first data bucket has been seen
155 htmlParserCtxtPtr parser ;
// brigade collecting this filter's output
156 apr_bucket_brigade* bb ;
// name of the detected input charset, or NULL
162 const char* encoding;
// Bits of the configuration's flags field, consumed by normalise() and
// proxy_html_merge() (NORM_RESET marks "do not inherit from the parent").
168 #define NORM_MSSLASH 0x2
169 #define NORM_RESET 0x4
// SAX callback table handed to htmlCreatePushParserCtxt.
170 static htmlSAXHandler sax ;
// Classification of an attribute while rewriting: leave alone, treat as a
// URI, or treat as a scripting event.
172 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ;
// Doctype declarations. Code in this file compares cfg->doctype against
// these by pointer identity, not by string content.
174 static const char* const fpi_html =
175 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ;
176 static const char* const fpi_html_legacy =
177 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ;
178 static const char* const fpi_xhtml =
179 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
180 static const char* const fpi_xhtml_legacy =
181 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ;
// Closing text of an empty element in HTML and XHTML syntax respectively.
182 static const char* const html_etag = ">" ;
183 static const char* const xhtml_etag = " />" ;
184 /*#define DEFAULT_DOCTYPE fpi_html */
// Defaults: no doctype is emitted, and empty elements are closed HTML-style.
185 static const char* const DEFAULT_DOCTYPE = "" ;
186 #define DEFAULT_ETAG html_etag
/* Normalise a NUL-terminated string in place.
 *
 * flags - bit set of NORM_* values selecting the passes to run
 * str   - string to modify
 *
 * With NORM_LC every character is visited; with NORM_MSSLASH every
 * backslash is visited.
 * NOTE(review): the statements applied to each visited character are not
 * visible here - presumably lower-casing and replacing '\' with '/'.
 */
188 static void normalise(unsigned int flags, char* str) {
190 if ( flags & NORM_LC )
191 for ( p = str ; *p ; ++p )
195 if ( flags & NORM_MSSLASH )
196 for ( p = ap_strchr_c(str, '\\') ; p ; p = ap_strchr_c(p+1, '\\') )
/* Feed a chunk of input to the libxml2 push parser, converting it through
 * ctx->conv_in first when an input converter has been set up.
 *
 * ctx   - filter/parser context
 * inbuf - raw input bytes
 * bytes - number of bytes at inbuf
 * flag  - passed through as the final argument of htmlParseChunk
 *
 * Bytes left over from an incomplete sequence are stored in ctx->conv_in
 * and prepended to the input of the next call.
 */
200 static void consume_buffer(saxctxt* ctx, const char* inbuf,
201 int bytes, int flag) {
206 int verbose = ctx->cfg->verbose;
// no converter configured: the parser takes the input unchanged
208 if (ctx->conv_in == NULL) {
209 /* just feed it to libxml2 */
210 htmlParseChunk(ctx->parser, inbuf, bytes, flag) ;
// leftovers from the previous call: build one buffer of leftovers + new input
213 if (ctx->conv_in->bytes > 0) {
214 /* FIXME: make this a reusable buf? */
215 buf = apr_palloc(ctx->f->r->pool, ctx->conv_in->bytes + bytes);
216 memcpy(buf, ctx->conv_in->buf, ctx->conv_in->bytes);
217 memcpy(buf + ctx->conv_in->bytes, inbuf, bytes);
218 bytes += ctx->conv_in->bytes;
219 ctx->conv_in->bytes = 0;
// convert in passes of at most 4096 output bytes; insz counts the input
// bytes still unconverted, so buf + (bytes - insz) is the resume point
226 apr_size_t outsz = 4096;
227 rv = apr_xlate_conv_buffer(ctx->conv_in->convset,
228 buf + (bytes - insz), &insz,
// 4096-outsz is the number of converted bytes produced by this pass
230 htmlParseChunk(ctx->parser, outbuf, 4096-outsz, flag) ;
234 case APR_EINCOMPLETE: /* save dangling byte(s) and return */
235 ctx->conv_in->bytes = insz;
// if buf still aliases the caller's inbuf the tail must be copied,
// otherwise the pool-allocated merge buffer can be referenced directly
236 ctx->conv_in->buf = (buf != inbuf) ? buf + (bytes-insz)
237 : apr_pmemdup(ctx->f->r->pool, buf + (bytes-insz), insz);
239 case APR_EINVAL: /* try skipping one bad byte */
240 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
241 "Skipping invalid byte in input stream!") ) ;
245 /* Erk! What's this? Bail out and eat the buf raw
246 * if libxml2 will accept it!
248 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, ctx->f->r,
249 "Failed to convert input; trying it raw") ;
250 htmlParseChunk(ctx->parser, buf + (bytes - insz), insz, flag) ;
251 ctx->conv_in = NULL; /* don't try converting any more */
// Write bytes to the next filter, converting from UTF-8 through
// ctx->conv_out first when an output converter has been set up.
//   ctx   - filter context
//   inbuf - bytes to write
//   bytes - number of bytes at inbuf
//   flush - non-zero when the data is expected to be complete, so a
//           dangling partial sequence is reported instead of being saved
256 static void AP_fwrite(saxctxt* ctx, const char* inbuf, int bytes, int flush) {
257 /* convert charset if necessary, and output */
262 int verbose = ctx->cfg->verbose;
// no converter configured: pass the bytes straight through
265 if (ctx->conv_out == NULL) {
266 ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);
// leftovers from the previous call: build one buffer of leftovers + new data
269 if (ctx->conv_out->bytes > 0) {
270 /* FIXME: make this a reusable buf? */
271 buf = apr_palloc(ctx->f->r->pool, ctx->conv_out->bytes + bytes);
272 memcpy(buf, ctx->conv_out->buf, ctx->conv_out->bytes);
273 memcpy(buf + ctx->conv_out->bytes, inbuf, bytes);
274 bytes += ctx->conv_out->bytes;
275 ctx->conv_out->bytes = 0;
// convert in passes of at most 2048 output bytes; insz counts the input
// bytes still unconverted
282 apr_size_t outsz = 2048;
283 rv = apr_xlate_conv_buffer(ctx->conv_out->convset,
284 buf + (bytes - insz), &insz,
// 2048-outsz is the number of converted bytes produced by this pass
286 ap_fwrite(ctx->f->next, ctx->bb, outbuf, 2048-outsz) ;
290 case APR_EINCOMPLETE: /* save dangling byte(s) and return */
291 /* but if we need to flush, just abandon them */
292 if ( flush) { /* if we're flushing, this must be complete */
293 /* so this is an error */
294 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
295 "Skipping invalid byte in output stream!") ) ;
297 ctx->conv_out->bytes = insz;
// copy the tail only when buf still aliases the caller's inbuf
298 ctx->conv_out->buf = (buf != inbuf) ? buf + (bytes-insz)
299 : apr_pmemdup(ctx->f->r->pool, buf + (bytes-insz), insz);
302 case APR_EINVAL: /* try skipping one bad byte */
303 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, ctx->f->r,
304 "Skipping invalid byte in output stream!") ) ;
308 /* Erk! What's this? Bail out and pass the rest of the buf
309 * unconverted (still UTF-8) to the next filter.
311 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, ctx->f->r,
312 "Failed to convert output; sending UTF-8") ) ;
313 ap_fwrite(ctx->f->next, ctx->bb, buf + (bytes - insz), insz) ;
319 /* This is always utf-8 on entry. We can convert charset within FLUSH */
320 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0) ; begin = i+1
321 static void pcharacters(void* ctxt, const xmlChar *uchars, int length) {
322 const char* chars = (const char*) uchars;
323 saxctxt* ctx = (saxctxt*) ctxt ;
326 for ( begin=i=0; i<length; i++ ) {
328 case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ;
329 case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ;
330 case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ;
331 case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ;
/* Make sure at least len bytes are free in ctx->buf beyond ctx->offset.
 *
 * ctx - filter context owning the buffer
 * len - number of additional bytes about to be written
 *
 * When there is not enough room, ctx->avail is raised in steps of
 * cfg->bufsz and the buffer is realloc'ed. The buffer is malloc-managed,
 * so a pool cleanup calling free() is kept registered for whichever
 * pointer is current.
 * NOTE(review): no check of the realloc result is visible here, and a
 * bufsz of 0 would keep the growth loop from terminating - confirm.
 */
337 static void preserve(saxctxt* ctx, const size_t len) {
339 if ( len <= ( ctx->avail - ctx->offset ) )
341 else while ( len > ( ctx->avail - ctx->offset ) )
342 ctx->avail += ctx->cfg->bufsz ;
344 newbuf = realloc(ctx->buf, ctx->avail) ;
// the block moved: swap the registered cleanup from the old pointer to the new
345 if ( newbuf != ctx->buf ) {
347 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (void*)free) ;
348 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
349 (void*)free, apr_pool_cleanup_null);
/* Append len bytes from buf to the context buffer at ctx->offset.
 * NOTE(review): the calls that reserve space and advance ctx->offset are
 * not visible here - presumably preserve() before and offset += len after.
 */
353 static void pappend(saxctxt* ctx, const char* buf, const size_t len) {
355 memcpy(ctx->buf+ctx->offset, buf, len) ;
/* Rewrite the text accumulated in ctx->buf using the rules flagged
 * M_CDATA, then write the result out through AP_fwrite with flush set.
 *
 * A NUL byte is appended first so the buffer can be handled as a C string.
 * Regex rules are matched repeatedly from a moving offset and replaced with
 * the ap_pregsub expansion; plain rules are located with strstr, subject
 * to M_ATSTART / M_ATEND. A replacement longer than the match first grows
 * the buffer with preserve() and shifts the tail; a shorter one is copied
 * in and the tail is pulled back.
 */
358 static void dump_content(saxctxt* ctx) {
361 size_t s_from, s_to ;
365 ap_regmatch_t pmatch[10] ;
368 urlmap* themap = ctx->map;
370 int verbose = ctx->cfg->verbose ;
373 pappend(ctx, &c, 1) ; /* append null byte */
374 /* parse the text for URLs */
375 for ( m = themap ; m ; m = m->next ) {
// rules without M_CDATA do not apply to this kind of content
376 if ( ! ( m->flags & M_CDATA ) )
378 if ( m->flags & M_REGEX ) {
381 while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) {
// match offsets reported by ap_regexec are relative to ctx->buf+offs
382 match = pmatch[0].rm_so ;
383 s_from = pmatch[0].rm_eo - match ;
384 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
386 s_to = strlen(subs) ;
387 len = strlen(ctx->buf) ;
390 const char* f = apr_pstrndup(ctx->f->r->pool,
391 ctx->buf + offs , s_from ) ;
392 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
393 "C/RX: match at %s, substituting %s", f, subs) ;
// growing: reserve room, move the tail (with its NUL) up, then copy in
395 if ( s_to > s_from) {
396 preserve(ctx, s_to - s_from) ;
397 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
398 len + 1 - s_from - offs) ;
399 memcpy(ctx->buf+offs, subs, s_to) ;
// same size or shrinking: copy in, then pull the tail down
401 memcpy(ctx->buf + offs, subs, s_to) ;
402 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
403 len + 1 - s_from - offs) ;
// plain (non-regex) rule
408 s_from = strlen(m->from.c) ;
409 s_to = strlen(m->to) ;
// resume searching after the text just substituted
410 for ( found = strstr(ctx->buf, m->from.c) ; found ;
411 found = strstr(ctx->buf+match+s_to, m->from.c) ) {
412 match = found - ctx->buf ;
413 if ( ( m->flags & M_ATSTART ) && ( match != 0) )
415 len = strlen(ctx->buf) ;
416 if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) )
418 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
419 "C: matched %s, substituting %s", m->from.c, m->to) ) ;
420 if ( s_to > s_from ) {
421 preserve(ctx, s_to - s_from) ;
422 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
423 len + 1 - s_from - match) ;
424 memcpy(ctx->buf+match, m->to, s_to) ;
426 memcpy(ctx->buf+match, m->to, s_to) ;
427 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
428 len + 1 - s_from - match) ;
433 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1) ;
/* SAX callback for CDATA content.
 *
 * ctxt   - the saxctxt for this request
 * uchars - the text
 * length - number of bytes at uchars
 *
 * With cfg->extfix set the text is buffered with pappend() for later
 * rewriting; the other visible path writes it out unflushed via AP_fwrite.
 */
435 static void pcdata(void* ctxt, const xmlChar *uchars, int length) {
436 const char* chars = (const char*) uchars;
437 saxctxt* ctx = (saxctxt*) ctxt ;
438 if ( ctx->cfg->extfix ) {
439 pappend(ctx, chars, length) ;
441 /* not sure if this should force-flush
442 * (i.e. can one cdata section come in multiple calls?)
444 AP_fwrite(ctx, chars, length, 0) ;
// SAX callback for a comment.
//   ctxt   - the saxctxt for this request
//   uchars - NUL-terminated comment text, without the delimiters
// cfg->strip_comments is tested first (presumably the comment is then
// dropped). With cfg->extfix the delimited comment is appended to the
// context buffer; the other visible path writes it straight to the output.
447 static void pcomment(void* ctxt, const xmlChar *uchars) {
448 const char* chars = (const char*) uchars;
449 saxctxt* ctx = (saxctxt*) ctxt ;
450 if ( ctx->cfg->strip_comments )
453 if ( ctx->cfg->extfix ) {
454 pappend(ctx, "<!--", 4) ;
455 pappend(ctx, chars, strlen(chars) ) ;
456 pappend(ctx, "-->", 3) ;
458 ap_fputs(ctx->f->next, ctx->bb, "<!--") ;
459 AP_fwrite(ctx, chars, strlen(chars), 1) ;
460 ap_fputs(ctx->f->next, ctx->bb, "-->") ;
// SAX endElement callback.
//   ctxt  - the saxctxt for this request
//   uname - element name
// Applies the doctype checks, dumps any buffered content via the context
// buffer, and writes the closing tag unless the element is declared empty.
463 static void pendElement(void* ctxt, const xmlChar* uname) {
464 saxctxt* ctx = (saxctxt*) ctxt ;
465 const char* name = (const char*) uname;
// libxml2's description of the element; NULL when the name is unknown
466 const htmlElemDesc* desc = htmlTagLookup(uname);
// strict doctypes: unknown and deprecated elements are singled out
468 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
470 if (!desc || desc->depr)
// NOTE(review): this condition repeats the one above, so this branch can
// never be taken; the comment suggests fpi_html_legacy / fpi_xhtml_legacy
// were intended. pstartElement has the same test - change both together.
473 } else if ((ctx->cfg->doctype == fpi_html)
474 || (ctx->cfg->doctype == fpi_xhtml)) {
475 /* enforce html legacy */
479 /* TODO - implement HTML "allowed here" using the stack */
480 /* nah. Keeping the stack is too much overhead */
// buffered content is pending for this element
482 if ( ctx->offset > 0 ) {
484 ctx->offset = 0 ; /* having dumped it, we can re-use the memory */
// elements declared empty get no end tag
486 if ( !desc || ! desc->empty ) {
487 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ;
/* SAX startElement callback: write the start tag, rewriting attribute
 * values according to the rule map.
 *
 * ctxt   - the saxctxt for this request
 * uname  - element name
 * uattrs - NULL-terminated array of alternating attribute names and values
 *
 * Steps visible here: choose an enforcement level from the doctype and log
 * bogus/deprecated elements; write "<name"; for each attribute check it
 * against the element description when enforcing, copy the value into the
 * context buffer, classify it as URI (listed for this element in
 * cfg->links) or event (listed in cfg->events, extfix only), apply the
 * M_HTML or M_EVENTS rules, normalise, and write it escaped through
 * pcharacters(); finally close the tag and report missing required
 * attributes.
 */
490 static void pstartElement(void* ctxt, const xmlChar* uname,
491 const xmlChar** uattrs ) {
500 size_t s_to, s_from, match ;
502 saxctxt* ctx = (saxctxt*) ctxt ;
504 ap_regmatch_t pmatch[10] ;
506 int verbose = ctx->cfg->verbose ;
508 apr_array_header_t *linkattrs;
510 const char* name = (const char*) uname;
511 const char** attrs = (const char**) uattrs;
// libxml2's description of the element; NULL when the name is unknown
512 const htmlElemDesc* desc = htmlTagLookup(uname);
513 urlmap* themap = ctx->map;
// strict doctypes
518 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
521 if (!desc || desc->depr)
// NOTE(review): this condition repeats the one above, so this branch can
// never be taken; fpi_html_legacy / fpi_xhtml_legacy were presumably
// intended. pendElement has the same test - change both together.
524 } else if ((ctx->cfg->doctype == fpi_html)
525 || (ctx->cfg->doctype == fpi_xhtml)) {
527 /* enforce html legacy */
532 if (!desc && enforce) {
533 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
534 "Bogus HTML element %s dropped", name) ;
// enforce == 2 is the level at which deprecated elements are rejected
537 if (desc && desc->depr && (enforce == 2) ) {
538 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
539 "Deprecated HTML element %s dropped", name) ;
543 descp = apr_array_push(ctx->stack);
545 /* TODO - implement HTML "allowed here" */
548 ap_fputc(ctx->f->next, ctx->bb, '<') ;
549 ap_fputs(ctx->f->next, ctx->bb, name) ;
// walk the element's list of required attributes
552 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
553 for (a = desc->attrs_req; *a; a++)
// attributes of this element that hold URIs, as configured
// NOTE(review): no NULL check of cfg->links is visible before this lookup
557 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING) ;
// a[0] is the attribute name, a[1] its value
558 for ( a = attrs ; *a ; a += 2 ) {
559 if (desc && enforce > 0) {
560 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
562 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
563 "Bogus HTML attribute %s of %s dropped", *a, name);
565 case HTML_DEPRECATED:
566 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
567 "Deprecated HTML attribute %s of %s dropped", *a, name);
570 required_attrs--; /* cross off the number still needed */
571 /* fallthrough - required implies valid */
// copy the value, including its NUL, into the context buffer for editing
578 pappend(ctx, a[1], strlen(a[1])+1) ;
579 is_uri = ATTR_IGNORE ;
581 tattr* attrs = (tattr*) linkattrs->elts;
582 for (i=0; i < linkattrs->nelts; ++i) {
583 if ( !strcmp(*a, attrs[i].val)) {
// not a URI attribute: see whether it is a configured event attribute
589 if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix
590 && (ctx->cfg->events != NULL) ) {
591 for (i=0; i < ctx->cfg->events->nelts; ++i) {
592 tattr* attrs = (tattr*) ctx->cfg->events->elts;
593 if ( !strcmp(*a, attrs[i].val)) {
594 is_uri = ATTR_EVENT ;
// rules applied to URI values (flag M_HTML)
602 for ( m = themap ; m ; m = m->next ) {
603 if ( ! ( m->flags & M_HTML ) )
605 if ( m->flags & M_REGEX ) {
607 if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) {
609 offs = match = pmatch[0].rm_so ;
610 s_from = pmatch[0].rm_eo - match ;
611 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
614 const char* f = apr_pstrndup(ctx->f->r->pool,
615 ctx->buf + offs , s_from ) ;
616 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
617 "H/RX: match at %s, substituting %s", f, subs) ;
619 s_to = strlen(subs) ;
620 len = strlen(ctx->buf) ;
// growing: reserve room, move the tail (with its NUL) up, then copy in
621 if ( s_to > s_from) {
622 preserve(ctx, s_to - s_from) ;
623 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
624 len + 1 - s_from - offs) ;
625 memcpy(ctx->buf+offs, subs, s_to) ;
627 memcpy(ctx->buf + offs, subs, s_to) ;
628 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
629 len + 1 - s_from - offs) ;
// plain rule: compared case-insensitively against the start of the value
633 s_from = strlen(m->from.c) ;
634 if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) {
636 s_to = strlen(m->to) ;
637 len = strlen(ctx->buf) ;
638 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
639 "H: matched %s, substituting %s", m->from.c, m->to) ) ;
640 if ( s_to > s_from ) {
641 preserve(ctx, s_to - s_from) ;
642 memmove(ctx->buf+s_to, ctx->buf+s_from,
644 memcpy(ctx->buf, m->to, s_to) ;
645 } else { /* it fits in the existing space */
646 memcpy(ctx->buf, m->to, s_to) ;
647 memmove(ctx->buf+s_to, ctx->buf+s_from,
653 /* URIs only want one match unless overridden in the config */
654 if ( (num_match > 0) && !( m->flags & M_NOTLAST ) )
// rules applied to event attribute values (flag M_EVENTS)
659 for ( m = themap ; m ; m = m->next ) {
660 num_match = 0 ; /* reset here since we're working per-rule */
661 if ( ! ( m->flags & M_EVENTS ) )
663 if ( m->flags & M_REGEX ) {
666 while ( ! ap_regexec(m->from.r, ctx->buf+offs,
667 nmatch, pmatch, 0) ) {
668 match = pmatch[0].rm_so ;
669 s_from = pmatch[0].rm_eo - match ;
670 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
673 const char* f = apr_pstrndup(ctx->f->r->pool,
674 ctx->buf + offs , s_from ) ;
675 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
676 "E/RX: match at %s, substituting %s", f, subs) ;
678 s_to = strlen(subs) ;
680 len = strlen(ctx->buf) ;
681 if ( s_to > s_from) {
682 preserve(ctx, s_to - s_from) ;
683 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
684 len + 1 - s_from - offs) ;
685 memcpy(ctx->buf+offs, subs, s_to) ;
687 memcpy(ctx->buf + offs, subs, s_to) ;
688 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
689 len + 1 - s_from - offs) ;
// plain rule: substring search within the event value
695 found = strstr(ctx->buf, m->from.c) ;
696 if ( (m->flags & M_ATSTART) && ( found != ctx->buf) )
699 s_from = strlen(m->from.c) ;
700 s_to = strlen(m->to) ;
701 match = found - ctx->buf ;
702 if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) {
703 found = strstr(ctx->buf+match+s_from, m->from.c) ;
706 found = strstr(ctx->buf+match+s_to, m->from.c) ;
708 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
709 "E: matched %s, substituting %s", m->from.c, m->to) ) ;
710 len = strlen(ctx->buf) ;
711 if ( s_to > s_from ) {
712 preserve(ctx, s_to - s_from) ;
713 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
714 len + 1 - s_from - match) ;
715 memcpy(ctx->buf+match, m->to, s_to) ;
717 memcpy(ctx->buf+match, m->to, s_to) ;
718 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
719 len + 1 - s_from - match) ;
// for events, later rules are skipped only when the rule carries M_LAST
724 if ( num_match && ( m->flags & M_LAST ) )
// attribute written as a bare name, with no value
733 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ;
736 if ( ctx->cfg->flags != 0 )
737 normalise(ctx->cfg->flags, ctx->buf) ;
739 /* write the attribute, using pcharacters to html-escape
740 anything that needs it in the value.
742 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ;
743 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf)) ;
744 ap_fputc(ctx->f->next, ctx->bb, '"') ;
// empty elements are closed with the configured etag, others with '>'
749 if ( desc && desc->empty )
750 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ;
752 ap_fputc(ctx->f->next, ctx->bb, '>') ;
754 if ((enforce > 0) && (required_attrs > 0)) {
755 /* if there are more required attributes than we found then complain */
756 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
757 "HTML element %s is missing %d required attributes",
758 name, required_attrs);
762 /* globals set once at startup */
// locates a META element carrying a content type (used by sniff_encoding)
763 static ap_regex_t* seek_meta_ctype ;
// extracts the charset value; capture group 1 is the charset name
764 static ap_regex_t* seek_charset ;
// locates META elements in general (used by metafix)
765 static ap_regex_t* seek_meta ;
/* Work out the character encoding of the input document.
 *
 * ctx  - filter context; ctx->encoding receives the charset name found
 * cbuf - start of the document, used as a NUL-terminated string
 *
 * Sources are tried in this order: a charset= parameter in the response
 * Content-Type, libxml2's own detection on the leading bytes, and a META
 * element located with the startup regexes. When libxml2 does not know the
 * charset, an apr_xlate converter to UTF-8 is tried and, on success,
 * installed as ctx->conv_in with UTF-8 returned. The last resort is the
 * configured default, or ISO-8859-1 when none is configured.
 */
767 static xmlCharEncoding sniff_encoding(saxctxt* ctx, const char* cbuf,
770 int verbose = ctx->cfg->verbose;
772 request_rec* r = ctx->f->r ;
773 proxy_html_conf* cfg = ctx->cfg ;
774 xmlCharEncoding ret ;
776 ap_regmatch_t match[2] ;
777 char* buf = (char*)cbuf ;
778 apr_xlate_t* convset;
780 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
781 "Content-Type is %s", r->content_type) ) ;
783 /* If we've got it in the HTTP headers, there's nothing to do */
// NOTE(review): "p > 0" is an ordered comparison on a pointer; a test
// against NULL would express the intent
784 if ( r->content_type &&
785 ( p = ap_strcasestr(r->content_type, "charset=") , p > 0 ) ) {
// the charset name runs up to the next space or semicolon
787 if ( ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ) ,
789 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
790 "Got charset %s from HTTP headers", ctx->encoding) ) ;
791 if ( ret = xmlParseCharEncoding(ctx->encoding),
792 ((ret != XML_CHAR_ENCODING_ERROR )
793 && (ret != XML_CHAR_ENCODING_NONE))) {
799 /* to sniff, first we look for BOM */
800 if (ctx->encoding == NULL) {
801 if ( ret = xmlDetectCharEncoding((const xmlChar*)buf, bytes),
802 ret != XML_CHAR_ENCODING_NONE ) {
803 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
804 "Got charset from XML rules.") ) ;
808 /* If none of the above, look for a META-thingey */
809 if ( ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0 ) {
// isolate the matched META text, then pull the charset out of it
810 p = apr_pstrndup(r->pool, buf + match[0].rm_so,
811 match[0].rm_eo - match[0].rm_so) ;
812 if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 )
813 ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
814 match[1].rm_eo - match[1].rm_so) ;
818 /* either it's set to something we found or it's still the default */
// NOTE(review): this message says "HTML META", but ctx->encoding may also
// have been set from the HTTP headers above - confirm
819 if ( ctx->encoding ) {
820 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
821 "Got charset %s from HTML META", ctx->encoding) ) ;
822 if ( ret = xmlParseCharEncoding(ctx->encoding),
823 ((ret != XML_CHAR_ENCODING_ERROR )
824 && (ret != XML_CHAR_ENCODING_NONE))) {
827 /* Unsupported charset. Can we get (iconv) support through apr_xlate? */
828 /* Aaargh! libxml2 has undocumented <META-crap> support. So this fails
829 * if metafix is not active. Have to make it conditional.
832 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
833 "Charset %s not supported by libxml2; trying apr_xlate", ctx->encoding) ) ;
// convert to UTF-8 ourselves and tell the parser the input is UTF-8
834 if (apr_xlate_open(&convset, "UTF-8", ctx->encoding, r->pool) == APR_SUCCESS) {
835 ctx->conv_in = apr_pcalloc(r->pool, sizeof(conv_t));
836 ctx->conv_in->convset = convset ;
837 return XML_CHAR_ENCODING_UTF8 ;
839 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
840 "Charset %s not supported. Consider aliasing it?", ctx->encoding) ;
843 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
844 "Charset %s not supported. Consider aliasing it or use metafix?",
850 /* Use configuration default as a last resort */
851 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
852 "No usable charset information; using configuration default") ;
853 return (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
854 ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ;
/* Scan the start of the document for META elements and turn those that
 * name an HTTP header into real response headers.
 *
 * r   - current request; headers are set in r->headers_out
 * buf - start of the document, used as a NUL-terminated string
 *
 * For every seek_meta match the header name is extracted; when name and
 * content are both found they are stored with apr_table_setn. For a
 * Content-Type META a meta record holding the match's start and end
 * offsets is allocated (presumably the return value, so the caller can
 * leave that span out of the parser input).
 */
856 static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/
867 ap_regmatch_t pmatch[2] ;
870 while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) {
873 p = buf+offs+pmatch[1].rm_eo ;
// NOTE(review): this scan does not stop at a NUL byte - confirm the regex
// guarantees a letter follows
874 while ( !isalpha(*++p) ) ;
875 for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ;
876 header = apr_pstrndup(r->pool, p, q-p) ;
// strncasecmp is non-zero when header does NOT begin with "Content-"
877 if ( strncasecmp(header, "Content-", 8) ) {
878 /* find content=... string */
// NOTE(review): ap_strstr may return NULL, which "*p" would dereference
879 for ( p = ap_strstr((char*)buf+offs+pmatch[0].rm_so, "content") ; *p ; ) {
881 while ( *p && isspace(*p) )
885 while ( *p && isspace(*++p) ) ;
886 if ( ( *p == '\'' ) || ( *p == '"' ) ) {
// NOTE(review): a missing closing quote lets this scan run past the NUL
888 for ( q = p ; *q != delim ; ++q ) ;
890 for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ;
892 content = apr_pstrndup(r->pool, p, q-p) ;
895 } else if ( !strncasecmp(header, "Content-Type", 12) ) {
896 ret = apr_palloc(r->pool, sizeof(meta) ) ;
// NOTE(review): rm_so/rm_eo are relative to buf+offs, not to buf; they
// equal absolute offsets only while offs is 0 - confirm against the caller
897 ret->start = pmatch[0].rm_so ;
898 ret->end = pmatch[0].rm_eo ;
900 if ( header && content ) {
901 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
902 "Adding header [%s: %s] from HTML META", header, content) ) ;
903 apr_table_setn(r->headers_out, header, content) ;
// continue searching after this META
905 offs += pmatch[0].rm_eo ;
/* Expand ${name} and ${name|default} references in a string using the
 * request's subprocess_env table.
 *
 * r   - current request; all new strings come from r->pool
 * str - string to expand
 *
 * Each reference is replaced by the variable's value; the text after '|'
 * is the replacement used on the other visible path (presumably when the
 * variable is unset). Each substitution is logged at debug level.
 */
910 static const char* interpolate_vars(request_rec* r, const char* str) {
916 const char* replacement;
// find the next "${" and its closing brace
920 if (start = ap_strstr_c(start, "${"), start == NULL)
923 if (end = ap_strchr_c(start+2, '}'), end == NULL)
// an optional '|' separates the variable name from a default value
926 delim = ap_strchr_c(start, '|');
927 before = apr_pstrndup(r->pool, str, start-str);
// name runs up to the '|' when there is one, otherwise up to the '}'
930 var = apr_pstrndup(r->pool, start+2, delim-start-2) ;
932 var = apr_pstrndup(r->pool, start+2, end-start-2) ;
934 replacement = apr_table_get(r->subprocess_env, var) ;
937 replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
940 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
941 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
942 "Interpolating %s => %s", var, replacement) ;
/* Build the per-request rule list from the configured rules.
 *
 * ctx - filter context; rules are read from ctx->cfg->map
 *
 * For each configured rule the optional environment condition is
 * evaluated and unsatisfied rules are skipped. Surviving rules are copied
 * into the request pool, and their from/to patterns are interpolated when
 * flagged; an interpolated regex from-pattern is recompiled.
 */
946 static void fixup_rules(saxctxt* ctx) {
951 request_rec* r = ctx->f->r;
954 for (p = ctx->cfg->map; p; p = p->next) {
956 if (p->cond != NULL) {
957 thisval = apr_table_get(r->subprocess_env, p->cond->env);
959 /* required to be "anything" */
961 has_cond = 1; /* satisfied */
963 has_cond = 0; /* unsatisfied */
// a value was given: the variable must equal it, ignoring case
965 if (thisval && !strcasecmp(p->cond->val, thisval)) {
966 has_cond = 1; /* satisfied */
968 has_cond = 0; /* unsatisfied */
// rel is 1 for a plain condition and -1 for a negated one
971 if (((has_cond == 0) && (p->cond->rel ==1 ))
972 || ((has_cond == 1) && (p->cond->rel == -1))) {
973 continue; /* condition is unsatisfied */
// work on a private copy so the configured rule is never modified
977 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
979 if (newp->flags & M_INTERPOLATE_FROM) {
980 newp->from.c = interpolate_vars(r, newp->from.c);
981 if (!newp->from.c || !*newp->from.c)
982 continue; /* don't use empty from-pattern */
983 if (newp->flags & M_REGEX) {
984 newp->from.r = ap_pregcomp(r->pool, newp->from.c, newp->regflags) ;
987 if (newp->flags & M_INTERPOLATE_TO) {
988 newp->to = interpolate_vars(r, newp->to);
990 /* evaluate p->cond; continue if unsatisfied */
991 /* create new urlmap with memcpy and append to map */
992 /* interpolate from if flagged to do so */
993 /* interpolate to if flagged to do so */
/* Decide whether the filter applies to this response and, if so, create
 * its context.
 *
 * f - the filter instance
 *
 * The filter is removed from the chain when the request is not a proxy
 * request, has no content type, or is neither text/html nor
 * application/xhtml+xml; the reason is logged when verbose is on. The
 * PROXY_HTML_FORCE environment variable is read as well (presumably to
 * bypass those checks). Otherwise a zeroed saxctxt is stored in f->ctx,
 * its output brigade is created, and Content-Length is unset.
 */
1005 static saxctxt* check_filter_init (ap_filter_t* f) {
1007 proxy_html_conf* cfg
1008 = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
1009 const char* force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
1011 const char* errmsg = NULL ;
1013 if ( ! f->r->proxyreq ) {
1014 errmsg = "Non-proxy request; not inserting proxy-html filter" ;
1015 } else if ( ! f->r->content_type ) {
1016 errmsg = "No content-type; bailing out of proxy-html filter" ;
// only the leading media type is compared, so parameters are ignored
1017 } else if ( strncasecmp(f->r->content_type, "text/html", 9) &&
1018 strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) {
1019 errmsg = "Non-HTML content; not inserting proxy-html filter" ;
1025 if ( cfg->verbose ) {
// NOTE(review): errmsg is passed as the format string; the values
// assigned above contain no % directives, but "%s", errmsg is safer
1026 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, errmsg) ;
1029 ap_remove_output_filter(f) ;
1034 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ;
1036 fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ;
// the body will be rewritten, so the upstream length no longer applies
1038 apr_table_unset(f->r->headers_out, "Content-Length") ;
1043 fctx->map = cfg->map;
1044 /* defer dealing with charset_out until after sniffing charset_in
1045 * so we can support setting one to t'other.
// Output filter entry point: parses the HTML passing through and writes
// the rewritten document to the next filter.
//   f  - the filter instance
//   bb - incoming brigade; it is emptied before returning
// On the first data bucket the filter optionally skips leading junk, sniffs
// the input charset, sets up output conversion and the Content-Type, writes
// the doctype and creates the libxml2 push parser. Later data is passed to
// consume_buffer(). EOS finishes the parse and passes the output brigade
// on; FLUSH is forwarded only once the parser exists.
1050 static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) {
1051 apr_xlate_t* convset;
1052 const char* charset = NULL;
1055 xmlCharEncoding enc ;
1056 const char* buf = 0 ;
1057 apr_size_t bytes = 0 ;
1058 #ifndef USE_OLD_LIBXML2
1059 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
1060 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ;
1063 saxctxt* ctxt = check_filter_init(f) ;
// filter not applicable: hand the data on untouched
1068 return ap_pass_brigade(f->next, bb) ;
1070 verbose = ctxt->cfg->verbose;
1073 for ( b = APR_BRIGADE_FIRST(bb) ;
1074 b != APR_BRIGADE_SENTINEL(bb) ;
1075 b = APR_BUCKET_NEXT(b) ) {
1076 if ( APR_BUCKET_IS_METADATA(b) ) {
1077 if ( APR_BUCKET_IS_EOS(b) ) {
// a zero-length chunk with the terminate flag set ends the parse
1078 if ( ctxt->parser != NULL ) {
1079 consume_buffer(ctxt, buf, 0, 1);
1081 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
1082 apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ;
// NOTE(review): the result of ap_pass_brigade is not used here
1083 ap_pass_brigade(ctxt->f->next, ctxt->bb) ;
1084 } else if ( APR_BUCKET_IS_FLUSH(b) ) {
1085 /* pass on flush, except at start where it would cause
1086 * headers to be sent before doc sniffing
1088 if ( ctxt->parser != NULL ) {
1089 ap_fflush(ctxt->f->next, ctxt->bb) ;
1092 } else if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
// first data bucket: the parser has not been created yet
1094 if ( ctxt->parser == NULL ) {
// NOTE(review): buf[bytes] is one byte beyond the data returned by
// apr_bucket_read - confirm that reading it is safe
1095 if ( buf[bytes] != 0 ) {
1096 /* make a string for parse routines to play with */
1097 char* buf1 = apr_palloc(f->r->pool, bytes+1) ;
1098 memcpy(buf1, buf, bytes) ;
1102 /* For publishing systems that insert crap at the head of a
1103 * page that buggers up the parser. Search to first instance
1104 * of some relatively sane, or at least parseable, element.
1106 if (ctxt->cfg->skipto != NULL) {
// NOTE(review): ap_strchr_c returns NULL when no '<' is present, and the
// loop condition below dereferences p - confirm a guard exists
1107 char* p = ap_strchr_c(buf, '<');
1108 tattr* starts = (tattr*) ctxt->cfg->skipto->elts;
1110 while (!found && *p) {
1112 for (i = 0; i < ctxt->cfg->skipto->nelts; ++i) {
1113 if ( !strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
1118 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1119 "Skipped to first <%s> element", starts[i].val)
1124 p = ap_strchr_c(p+1, '<');
1127 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1128 "Failed to find start of recognised HTML!") ;
1132 enc = sniff_encoding(ctxt, buf, bytes) ;
1133 /* now we have input charset, set output charset too */
1134 if (ctxt->cfg->charset_out) {
// "*" selects the detected input charset as the output charset
1135 if (!strcmp(ctxt->cfg->charset_out, "*"))
1136 charset = ctxt->encoding;
1138 charset = ctxt->cfg->charset_out;
// UTF-8 output needs no converter
1139 if (strcasecmp(charset, "utf-8")) {
1140 if (apr_xlate_open(&convset, charset, "UTF-8",
1141 f->r->pool) == APR_SUCCESS) {
1142 ctxt->conv_out = apr_pcalloc(f->r->pool, sizeof(conv_t));
1143 ctxt->conv_out->convset = convset;
1145 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1146 "Output charset %s not supported. Falling back to UTF-8",
// advertise the charset that is actually being sent
1151 if (ctxt->conv_out) {
1152 const char* ctype = apr_psprintf(f->r->pool,
1153 "text/html;charset=%s", charset);
1154 ap_set_content_type(f->r, ctype) ;
1156 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
1158 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype) ;
// the parser is created with the first 4 bytes of the document
1159 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf, 4, 0, enc) ;
// no parser could be created: pass the data on and drop out of the chain
1162 if (ctxt->parser == NULL) {
1163 apr_status_t rv = ap_pass_brigade(f->next, bb) ;
1164 ap_remove_output_filter(f) ;
// the parser is freed together with the request pool
1167 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
1168 (void*)htmlFreeParserCtxt, apr_pool_cleanup_null) ;
1169 #ifndef USE_OLD_LIBXML2
// xmlCtxtUseOptions returns the options it could not apply
1170 if ( xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts ), xmlopts )
1171 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
1172 "Unsupported parser opts %x", xmlopts) ;
1174 if ( ctxt->cfg->metafix )
1176 m = metafix(f->r, buf, ctxt->cfg->verbose) ;
1178 m = metafix(f->r, buf) ;
// parse the text before and after the META span, leaving the span out
1181 consume_buffer(ctxt, buf, m->start, 0) ;
1182 consume_buffer(ctxt, buf+m->end, bytes-m->end, 0) ;
1184 consume_buffer(ctxt, buf, bytes, 0) ;
// later buckets go straight to the parser
1187 consume_buffer(ctxt, buf, bytes, 0) ;
1190 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ;
1193 /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */
1194 apr_brigade_cleanup(bb) ;
1195 return APR_SUCCESS ;
/* Create a per-directory configuration record.
 *
 * pool - pool the record is allocated from
 * x    - not used in the visible lines
 *
 * The record is zero-filled and then given the default doctype, the
 * default empty-element terminator and "no default encoding".
 */
1198 static void* proxy_html_config(apr_pool_t* pool, char* x) {
1199 proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ;
1200 ret->doctype = DEFAULT_DOCTYPE ;
1201 ret->etag = DEFAULT_ETAG ;
1203 ret->default_encoding = XML_CHAR_ENCODING_NONE ;
1204 /* ret->interp = 1; */
1205 /* don't initialise links and events until they get set/used */
/* Merge a parent (BASE) and a more specific (ADD) configuration.
 *
 * pool - pool for the merged record
 * BASE - parent proxy_html_conf
 * ADD  - overriding proxy_html_conf
 *
 * links, events, default_encoding, charset_out, doctype and etag come from
 * ADD when set there, otherwise from BASE. When both have rules the merged
 * list is built from copies of the BASE rules followed by copies of the
 * ADD rules, each pushed onto the front of the list. The boolean options
 * are OR-ed together unless ADD carries NORM_RESET, in which case ADD's
 * values are used alone.
 */
1208 static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) {
1209 proxy_html_conf* base = (proxy_html_conf*) BASE ;
1210 proxy_html_conf* add = (proxy_html_conf*) ADD ;
// apr_palloc does not zero memory, so each field must be assigned
1211 proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ;
1213 /* don't merge declarations - just use the most specific */
1214 conf->links = (add->links == NULL) ? base->links : add->links;
1215 conf->events = (add->events == NULL) ? base->events : add->events;
1217 conf->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE)
1218 ? base->default_encoding : add->default_encoding ;
1219 conf->charset_out = (add->charset_out == NULL)
1220 ? base->charset_out : add->charset_out ;
1222 if ( add->map && base->map ) {
// copy each rule and push it onto the front of the merged list
1225 for ( a = base->map ; a ; a = a->next ) {
1226 urlmap* save = conf->map ;
1227 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
1228 conf->map->next = save ;
1230 for ( a = add->map ; a ; a = a->next ) {
1231 urlmap* save = conf->map ;
1232 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
1233 conf->map->next = save ;
// only one side has rules: share its list
1236 conf->map = add->map ? add->map : base->map ;
// "unset" is detected by pointer identity with the default values
1238 conf->doctype = ( add->doctype == DEFAULT_DOCTYPE )
1239 ? base->doctype : add->doctype ;
1240 conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ;
1241 conf->bufsz = add->bufsz ;
// NORM_RESET: take ADD's options alone and clear the marker bit
1242 if ( add->flags & NORM_RESET ) {
1243 conf->flags = add->flags ^ NORM_RESET ;
1244 conf->metafix = add->metafix ;
1245 conf->extfix = add->extfix ;
1246 conf->interp = add->interp ;
1247 conf->strip_comments = add->strip_comments ;
1248 conf->skipto = add->skipto ;
1250 conf->verbose = add->verbose ;
// otherwise an option is on when either level turned it on
1253 conf->flags = base->flags | add->flags ;
1254 conf->metafix = base->metafix | add->metafix ;
1255 conf->extfix = base->extfix | add->extfix ;
1256 conf->interp = base->interp | add->interp ;
1257 conf->strip_comments = base->strip_comments | add->strip_comments ;
1258 conf->skipto = add->skipto ? add->skipto : base->skipto ;
1260 conf->verbose = base->verbose | add->verbose ;
/* REGFLAG(n,s,c):  yields n when string s is non-NULL and contains the
 *                  character c, otherwise 0 (flag is opt-in).
 * XREGFLAG(n,s,c): yields n when s is NULL or does NOT contain c,
 *                  otherwise 0 (flag is on unless c is given).
 * Both evaluate s more than once, and s is not parenthesised in the
 * null test, so pass a plain identifier.
 */
1265 #define REGFLAG(n,s,c) ( (s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0 )
1266 #define XREGFLAG(n,s,c) ( (!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0 )
/* Populate newmap from the textual parts of one URL-map rule.
 *   pool  - pool used for the compiled regex and the condition record
 *   from  - pattern to match (stored as a string or compiled as a regex)
 *   to    - replacement text
 *   flags - string of single-letter options, may be NULL
 *   cond  - optional condition of the form [!]name[=value]
 */
1267 static void comp_urlmap(apr_pool_t* pool, urlmap* newmap,
1268 const char* from, const char* to, const char* flags, const char* cond) {
/* Rule flags.  'h', 'e' and 'c' go through XREGFLAG, so M_HTML, M_EVENTS
 * and M_CDATA are set unless the letter is present; the remaining letters
 * go through REGFLAG and set their bit only when present.  The result is
 * read back as newmap->flags just below. */
1271 = XREGFLAG(M_HTML,flags,'h')
1272 | XREGFLAG(M_EVENTS,flags,'e')
1273 | XREGFLAG(M_CDATA,flags,'c')
1274 | REGFLAG(M_ATSTART,flags,'^')
1275 | REGFLAG(M_ATEND,flags,'$')
1276 | REGFLAG(M_REGEX,flags,'R')
1277 | REGFLAG(M_LAST,flags,'L')
1278 | REGFLAG(M_NOTLAST,flags,'l')
1279 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1280 | REGFLAG(M_INTERPOLATE_FROM,flags,'v')
/* Keep the from-pattern as a plain string when M_INTERPOLATE_FROM is set or
 * the rule is not a regex; otherwise it is compiled below. */
1282 if ( ( newmap->flags & M_INTERPOLATE_FROM)
1283 || ! (newmap->flags & M_REGEX) ) {
1284 newmap->from.c = from ;
/* Regex compile options: 'x' extended, 'i' case-insensitive, 'n' no
 * sub-matches, 's' AP_REG_NEWLINE. */
1288 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1289 | REGFLAG(AP_REG_ICASE,flags,'i')
1290 | REGFLAG(AP_REG_NOSUB,flags,'n')
1291 | REGFLAG(AP_REG_NEWLINE,flags,'s')
1293 newmap->from.r = ap_pregcomp(pool, from, newmap->regflags) ;
/* Condition record, zero-filled by apr_pcalloc.  A leading '!' gives
 * rel = -1 and drops the '!' from the name; otherwise rel = 1.
 * NOTE(review): cond[0] is dereferenced - presumably this section is only
 * reached when cond is non-NULL; confirm. */
1297 newmap->cond = apr_pcalloc(pool, sizeof(rewritecond));
1298 if (cond[0] == '!') {
1299 newmap->cond->rel = -1;
1300 newmap->cond->env = cond+1;
1302 newmap->cond->rel = 1;
1303 newmap->cond->env = cond;
/* Look for "=value".  cond is advanced by one before the search and a match
 * at the advanced position is rejected.
 * NOTE(review): for a negated condition that only rejects an empty name,
 * but for a plain condition it skips index 0 and rejects index 1, so a
 * one-character name such as "a=b" never gets a value - looks unintended. */
1305 eq = ap_strchr_c(++cond, '=');
1306 if (eq && (eq != cond)) {
1308 newmap->cond->val = eq+1;
/* NOTE(review): presumably the no-condition case. */
1311 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (registered with raw args).
 * Splits args into from, to, optional flags and optional cond using
 * ap_getword_conf(), appends a new urlmap to cfg->map and lets
 * comp_urlmap() fill it in.
 */
1314 static const char* set_urlmap(cmd_parms* cmd, void* CFG, const char* args) {
1315 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
1317 apr_pool_t* pool = cmd->pool;
1320 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1324 const char* cond = NULL;
/* Comma expressions: assign the next word, then test it for NULL. */
1326 if (from = ap_getword_conf(cmd->pool, &args), !from)
1328 if (to = ap_getword_conf(cmd->pool, &args), !to)
/* cond is only looked for when a non-empty flags word was present, so a
 * condition cannot be given without flags. */
1330 flags = ap_getword_conf(cmd->pool, &args);
1331 if (flags && *flags)
1332 cond = ap_getword_conf(cmd->pool, &args);
1336 /* the args look OK, so let's use them */
/* apr_palloc() does not zero memory; next is cleared here and the remaining
 * fields are left for comp_urlmap(). */
1337 newmap = apr_palloc(pool, sizeof(urlmap) ) ;
1338 newmap->next = NULL;
/* Walk to the last entry and append, preserving configuration order.
 * NOTE(review): the loop dereferences cfg->map - presumably guarded by a
 * non-NULL check with a separate path for the first entry; confirm. */
1340 for ( map = cfg->map ; map->next ; map = map->next ) ;
1341 map->next = newmap ;
1345 comp_urlmap(cmd->pool, newmap, from, to, flags, cond);
/* Handler for ProxyHTMLDoctype.  t selects the document type and l is an
 * optional second word.  Sets both cfg->doctype and cfg->etag.
 */
1349 static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t,
1351 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
/* "xhtml" (any case): XHTML empty-tag style; "legacy" as second word picks
 * the legacy FPI. */
1352 if ( !strcasecmp(t, "xhtml") ) {
1353 cfg->etag = xhtml_etag ;
1354 if ( l && !strcasecmp(l, "legacy") )
1355 cfg->doctype = fpi_xhtml_legacy ;
1357 cfg->doctype = fpi_xhtml ;
/* "html" (any case): same scheme with the HTML variants. */
1358 } else if ( !strcasecmp(t, "html") ) {
1359 cfg->etag = html_etag ;
1360 if ( l && !strcasecmp(l, "legacy") )
1361 cfg->doctype = fpi_html_legacy ;
1363 cfg->doctype = fpi_html ;
/* Anything else: t itself is copied into the pool and used as the doctype.
 * A second word starting with 'x' or 'X' selects xhtml_etag;
 * NOTE(review): html_etag is presumably the alternative branch. */
1365 cfg->doctype = apr_pstrdup(cmd->pool, t) ;
1366 if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) )
1367 cfg->etag = xhtml_etag ;
1369 cfg->etag = html_etag ;
/* Handler for ProxyHTMLFixups, called once per word.  Recognised words are
 * OR-ed into cfg->flags:
 *   "lowercase" -> NORM_LC
 *   "dospath"   -> NORM_MSSLASH
 *   "reset"     -> NORM_RESET
 * Matching uses strcmp(), so it is case-sensitive.  NULL or empty arg is
 * ignored.
 */
1373 static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg) {
1374 proxy_html_conf* cfg = CFG;
1375 if ( arg && *arg ) {
1376 if ( !strcmp(arg, "lowercase") )
1377 cfg->flags |= NORM_LC ;
1378 else if ( !strcmp(arg, "dospath") )
1379 cfg->flags |= NORM_MSSLASH ;
1380 else if ( !strcmp(arg, "reset") )
1381 cfg->flags |= NORM_RESET ;
/* Handler for ProxyHTMLEvents, called once per word.  Creates cfg->events on
 * first use (initial capacity 20 elements of sizeof(tattr)) and pushes one
 * new element for this word.
 */
1385 static const char* set_events(cmd_parms* cmd, void* CFG, const char* arg) {
1387 proxy_html_conf* cfg = CFG;
1388 if (cfg->events == NULL)
1389 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
/* apr_array_push() returns a pointer to the newly added slot. */
1390 attr = apr_array_push(cfg->events) ;
/* Handler for ProxyHTMLStartParse, called once per word.  Creates
 * cfg->skipto on first use (initial capacity 4 elements of sizeof(tattr))
 * and pushes one new element for this word.
 */
1394 static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg) {
1396 proxy_html_conf* cfg = CFG;
1397 if (cfg->skipto == NULL)
1398 cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
/* apr_array_push() returns a pointer to the newly added slot. */
1399 attr = apr_array_push(cfg->skipto) ;
/* Handler for ProxyHTMLLinks: records that attribute att of element elt is
 * to be handled.  cfg->links is a hash keyed by element name whose values
 * are arrays, one pushed entry per attribute.
 */
1403 static const char* set_links(cmd_parms* cmd, void* CFG,
1404 const char* elt, const char* att) {
1405 apr_array_header_t* attrs;
1407 proxy_html_conf* cfg = CFG;
/* Create the hash lazily on the first ProxyHTMLLinks directive. */
1409 if (cfg->links == NULL)
1410 cfg->links = apr_hash_make(cmd->pool);
1412 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING) ;
/* NOTE(review): presumably only done when the lookup found nothing.
 * The element size here is sizeof(tattr*), whereas set_events/set_skipto
 * use sizeof(tattr) for arrays filled the same way - confirm the two sizes
 * coincide, otherwise the pushed slot is the wrong size. */
1414 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*)) ;
1415 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs) ;
1417 attr = apr_array_push(attrs) ;
/* Handler for ProxyHTMLCharsetAlias: registers alias as another name for
 * charset with libxml2.  Only allowed in global context, which is checked
 * with ap_check_cmd_context(GLOBAL_ONLY).
 */
1421 static const char* set_charset_alias(cmd_parms* cmd, void* CFG,
1422 const char* charset, const char* alias) {
1423 const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY);
/* libxml2 documents a return of 0 from xmlAddEncodingAlias() as success.
 * NOTE(review): the error string below is presumably reached only when
 * that call fails; confirm. */
1426 else if (xmlAddEncodingAlias(charset, alias) == 0)
1429 return "Error setting charset alias";
/* Handler for ProxyHTMLCharsetDefault: resolves charset through
 * xmlParseCharEncoding() and stores the result in cfg->default_encoding.
 * Returns an error string when the lookup yields XML_CHAR_ENCODING_NONE or
 * XML_CHAR_ENCODING_ERROR.
 */
1431 static const char* set_charset_default(cmd_parms* cmd, void* CFG,
1432 const char* charset) {
1433 proxy_html_conf* cfg = CFG;
/* The field is overwritten before validation, so on failure it holds the
 * NONE/ERROR value. */
1434 cfg->default_encoding = xmlParseCharEncoding(charset);
1435 switch(cfg->default_encoding) {
1436 case XML_CHAR_ENCODING_NONE:
1437 return "Default charset not found";
1438 case XML_CHAR_ENCODING_ERROR:
1439 return "Invalid or unsupported default charset";
/* Configuration directive table.  Every directive is valid in server and
 * directory context (RSRC_CONF|ACCESS_CONF) except ProxyHTMLCharsetAlias,
 * which is RSRC_CONF only.  Simple on/off, integer and string settings are
 * written straight into proxy_html_conf through the ap_set_*_slot helpers
 * and APR_OFFSETOF; the rest use the set_* handlers in this file.
 */
1444 static const command_rec proxy_html_cmds[] = {
1445 AP_INIT_ITERATE("ProxyHTMLStartParse", set_skipto, NULL,
1446 RSRC_CONF|ACCESS_CONF,
1447 "Ignore anything in front of the first of these elements"),
1448 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1449 RSRC_CONF|ACCESS_CONF, "Strings to be treated as scripting events"),
1450 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1451 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
/* Raw args: set_urlmap does its own word splitting. */
1452 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1453 RSRC_CONF|ACCESS_CONF, "Map URL From To" ) ,
1454 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1455 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) ,
/* NOTE(review): set_flags also accepts "reset", which the help text below
 * does not mention. */
1456 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1457 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) ,
1458 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1459 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1460 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) ,
1461 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1462 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1463 RSRC_CONF|ACCESS_CONF,
1464 "Support interpolation and conditions in URLMaps" ) ,
1465 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1466 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1467 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) ,
1468 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1469 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1470 RSRC_CONF|ACCESS_CONF, "Strip out comments" ) ,
1472 AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot,
1473 (void*)APR_OFFSETOF(proxy_html_conf, verbose),
1474 RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) ,
1476 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1477 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1478 RSRC_CONF|ACCESS_CONF, "Buffer size" ) ,
/* Server-wide only: set_charset_alias additionally enforces GLOBAL_ONLY. */
1479 AP_INIT_ITERATE2("ProxyHTMLCharsetAlias", set_charset_alias, NULL,
1480 RSRC_CONF, "ProxyHTMLCharsetAlias charset alias [more aliases]" ) ,
1481 AP_INIT_TAKE1("ProxyHTMLCharsetDefault", set_charset_default, NULL,
1482 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetDefault charset" ) ,
1483 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1484 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1485 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset" ) ,
/* post_config hook (registered in proxy_html_hooks).  Adds the module's
 * version string to the server banner, compiles the regexes used to find
 * META elements and charset declarations, and sets up the shared SAX
 * handler table.
 */
1488 static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2,
1490 ap_add_version_component(p, VERSION_STRING) ;
/* The three patterns are compiled case-insensitively from pool p.
 * NOTE(review): the ap_pregcomp() results are not checked for NULL here. */
1491 seek_meta_ctype = ap_pregcomp(p,
1492 "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
1493 AP_REG_EXTENDED|AP_REG_ICASE) ;
1494 seek_charset = ap_pregcomp(p, "charset=([A-Za-z0-9_-]+)",
1495 AP_REG_EXTENDED|AP_REG_ICASE) ;
1496 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1497 AP_REG_EXTENDED|AP_REG_ICASE) ;
/* Start from an all-zero handler so that only the five callbacks set below
 * are non-NULL. */
1498 memset(&sax, 0, sizeof(htmlSAXHandler));
1499 sax.startElement = pstartElement ;
1500 sax.endElement = pendElement ;
1501 sax.characters = pcharacters ;
1502 sax.comment = pcomment ;
1503 sax.cdataBlock = pcdata ;
/* Hook registration: installs proxy_html_filter as the resource-level
 * output filter named "proxy-html", declared as changing the response body
 * and its length, and registers mod_proxy_html as a post_config hook.
 */
1506 static void proxy_html_hooks(apr_pool_t* p) {
1507 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1508 NULL, AP_FTYPE_RESOURCE,
1509 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH) ;
1510 ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ;
1512 module AP_MODULE_DECLARE_DATA proxy_html_module = {
1513 STANDARD20_MODULE_STUFF,