1 /********************************************************************
2 Copyright (c) 2003-4, WebThing Ltd
3 Author: Nick Kew <nick@webthing.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *********************************************************************/
22 /********************************************************************
25 You are requested to register as a user, at
26 http://apache.webthing.com/registration.html
28 This entitles you to support from the developer.
29 I'm unlikely to reply to help/support requests from
30 non-registered users, unless you're paying and/or offering
31 constructive feedback such as bug reports or sensible
32 suggestions for further development.
34 It also makes a small contribution to the effort
35 that's gone into developing this work.
36 *********************************************************************/
45 You can #define GO_FASTER to disable informational logging.
46 This disables the ProxyHTMLLogVerbose option altogether.
48 Default is to leave it undefined, and enable verbose logging
49 as a configuration option. Binaries are supplied with verbose
/* VERBOSE(x): execute statement x only when the `verbose` variable that is
   in scope at the call site is non-zero (callers set it from the module's
   verbose configuration flag).

   The expansion is wrapped in do { ... } while (0) so that the macro behaves
   as exactly one statement: a bare `if ( verbose ) x` would capture any
   `else` that follows the call site (dangling else).  Call sites must end
   the invocation with a semicolon, as they already do. */
#define VERBOSE(x) do { if ( verbose ) { x ; } } while (0)
59 #define VERSION_STRING "proxy_html/2.4"
64 #include <libxml/HTMLparser.h>
67 #include <http_protocol.h>
68 #include <http_config.h>
70 #include <apr_strings.h>
72 module AP_MODULE_DECLARE_DATA proxy_html_module ;
78 #define M_ATSTART 0x10
86 typedef struct urlmap {
109 htmlSAXHandlerPtr sax ;
111 proxy_html_conf* cfg ;
112 htmlParserCtxtPtr parser ;
113 apr_bucket_brigade* bb ;
119 static int is_empty_elt(const char* name) {
121 static const char* empty_elts[] = {
137 for ( p = empty_elts ; *p ; ++p )
138 if ( !strcmp( *p, name) )
149 #define NORM_MSSLASH 0x2
150 #define NORM_RESET 0x4
152 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ;
154 static void normalise(unsigned int flags, char* str) {
156 if ( flags & NORM_LC )
157 for ( p = str ; *p ; ++p )
161 if ( flags & NORM_MSSLASH )
162 for ( p = strchr(str, '\\') ; p ; p = strchr(p+1, '\\') )
/* FLUSH: write the pending run of characters chars[begin .. i-1] to the next
   filter via ap_fwrite, then move `begin` just past the character at index i.
   Relies on `ctx`, `chars`, `begin` and `i` being in scope at the call site.

   Both steps are wrapped in do { ... } while (0) so the macro expands to a
   single statement; as two bare statements, only the write would be governed
   by a preceding `if`/`else`, while `begin = i+1` would always run. */
#define FLUSH do { \
    ap_fwrite(ctx->f->next, ctx->bb, (chars+begin), (i-begin)) ; \
    begin = i+1 ; \
  } while (0)
168 static void pcharacters(void* ctxt, const xmlChar *chars, int length) {
169 saxctxt* ctx = (saxctxt*) ctxt ;
172 for ( begin=i=0; i<length; i++ ) {
174 case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ;
175 case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ;
176 case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ;
177 case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ;
183 static void preserve(saxctxt* ctx, const size_t len) {
185 if ( len <= ( ctx->avail - ctx->offset ) )
187 else while ( len > ( ctx->avail - ctx->offset ) )
188 ctx->avail += ctx->cfg->bufsz ;
190 newbuf = realloc(ctx->buf, ctx->avail) ;
191 if ( newbuf != ctx->buf ) {
193 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (void*)free) ;
194 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
195 (void*)free, apr_pool_cleanup_null);
199 static void pappend(saxctxt* ctx, const char* buf, const size_t len) {
201 memcpy(ctx->buf+ctx->offset, buf, len) ;
204 static void dump_content(saxctxt* ctx) {
207 size_t s_from, s_to ;
211 ap_regmatch_t pmatch[10] ;
215 int verbose = ctx->cfg->verbose ;
218 pappend(ctx, &c, 1) ; /* append null byte */
219 /* parse the text for URLs */
220 for ( m = ctx->cfg->map ; m ; m = m->next ) {
221 if ( ! ( m->flags & M_CDATA ) )
223 if ( m->flags & M_REGEX ) {
226 while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) {
227 match = pmatch[0].rm_so ;
228 s_from = pmatch[0].rm_eo - match ;
229 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
231 s_to = strlen(subs) ;
232 len = strlen(ctx->buf) ;
235 const char* f = apr_pstrndup(ctx->f->r->pool,
236 ctx->buf + offs , s_from ) ;
237 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
238 "C/RX: match at %s, substituting %s", f, subs) ;
240 if ( s_to > s_from) {
241 preserve(ctx, s_to - s_from) ;
242 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
243 len + 1 - s_from - offs) ;
244 memcpy(ctx->buf+offs, subs, s_to) ;
246 memcpy(ctx->buf + offs, subs, s_to) ;
247 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
248 len + 1 - s_from - offs) ;
253 s_from = strlen(m->from.c) ;
254 s_to = strlen(m->to) ;
255 for ( found = strstr(ctx->buf, m->from.c) ; found ;
256 found = strstr(ctx->buf+match+s_to, m->from.c) ) {
257 match = found - ctx->buf ;
258 if ( ( m->flags & M_ATSTART ) && ( match != 0) )
260 len = strlen(ctx->buf) ;
261 if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) )
263 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
264 "C: matched %s, substituting %s", m->from.c, m->to) ) ;
265 if ( s_to > s_from ) {
266 preserve(ctx, s_to - s_from) ;
267 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
268 len + 1 - s_from - match) ;
269 memcpy(ctx->buf+match, m->to, s_to) ;
271 memcpy(ctx->buf+match, m->to, s_to) ;
272 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
273 len + 1 - s_from - match) ;
278 ap_fputs(ctx->f->next, ctx->bb, ctx->buf) ;
280 static void pcdata(void* ctxt, const xmlChar *chars, int length) {
281 saxctxt* ctx = (saxctxt*) ctxt ;
282 if ( ctx->cfg->extfix ) {
283 pappend(ctx, chars, length) ;
285 ap_fwrite(ctx->f->next, ctx->bb, chars, length) ;
288 static void pcomment(void* ctxt, const xmlChar *chars) {
289 saxctxt* ctx = (saxctxt*) ctxt ;
290 if ( ctx->cfg->strip_comments )
293 if ( ctx->cfg->extfix ) {
294 pappend(ctx, "<!--", 4) ;
295 pappend(ctx, chars, strlen(chars) ) ;
296 pappend(ctx, "-->", 3) ;
298 ap_fputstrs(ctx->f->next, ctx->bb, "<!--", chars, "-->", NULL) ;
301 static void pendElement(void* ctxt, const xmlChar* name) {
302 saxctxt* ctx = (saxctxt*) ctxt ;
303 if ( ctx->offset > 0 ) {
305 ctx->offset = 0 ; /* having dumped it, we can re-use the memory */
307 if ( ! is_empty_elt(name) )
308 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ;
310 static void pstartElement(void* ctxt, const xmlChar* name,
311 const xmlChar** attrs ) {
317 const char** linkattrs ;
320 const char** linkattr ;
322 size_t s_to, s_from, match ;
324 saxctxt* ctx = (saxctxt*) ctxt ;
326 ap_regmatch_t pmatch[10] ;
328 int verbose = ctx->cfg->verbose ;
331 static const char* href[] = { "href", NULL } ;
332 static const char* cite[] = { "cite", NULL } ;
333 static const char* action[] = { "action", NULL } ;
334 static const char* imgattr[] = { "src", "longdesc", "usemap", NULL } ;
335 static const char* inputattr[] = { "src", "usemap", NULL } ;
336 static const char* scriptattr[] = { "src", "for", NULL } ;
337 static const char* frameattr[] = { "src", "longdesc", NULL } ;
338 static const char* objattr[] = { "classid", "codebase", "data", "usemap", NULL } ;
339 static const char* profile[] = { "profile", NULL } ;
340 static const char* background[] = { "background", NULL } ;
341 static const char* codebase[] = { "codebase", NULL } ;
343 static const elt_t linked_elts[] = {
345 { "img" , imgattr } ,
348 { "script" , scriptattr } ,
351 { "input" , inputattr } ,
352 { "frame", frameattr } ,
353 { "iframe", frameattr } ,
354 { "object", objattr } ,
356 { "blockquote" , cite } ,
359 { "head" , profile } ,
360 { "body" , background } ,
361 { "applet", codebase } ,
364 static const char* events[] = {
386 ap_fputc(ctx->f->next, ctx->bb, '<') ;
387 ap_fputs(ctx->f->next, ctx->bb, name) ;
391 for ( elt = linked_elts; elt->name != NULL ; ++elt )
392 if ( !strcmp(elt->name, name) ) {
393 linkattrs = elt->attrs ;
396 for ( a = attrs ; *a ; a += 2 ) {
399 pappend(ctx, a[1], strlen(a[1])+1) ;
400 is_uri = ATTR_IGNORE ;
402 for ( linkattr = linkattrs ; *linkattr ; ++linkattr) {
403 if ( !strcmp(*linkattr, *a) ) {
409 if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix ) {
410 for ( linkattr = events; *linkattr; ++linkattr ) {
411 if ( !strcmp(*linkattr, *a) ) {
412 is_uri = ATTR_EVENT ;
420 for ( m = ctx->cfg->map ; m ; m = m->next ) {
421 if ( ! ( m->flags & M_HTML ) )
423 if ( m->flags & M_REGEX ) {
425 if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) {
427 offs = match = pmatch[0].rm_so ;
428 s_from = pmatch[0].rm_eo - match ;
429 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
432 const char* f = apr_pstrndup(ctx->f->r->pool,
433 ctx->buf + offs , s_from ) ;
434 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
435 "H/RX: match at %s, substituting %s", f, subs) ;
437 s_to = strlen(subs) ;
438 len = strlen(ctx->buf) ;
439 if ( s_to > s_from) {
440 preserve(ctx, s_to - s_from) ;
441 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
442 len + 1 - s_from - offs) ;
443 memcpy(ctx->buf+offs, subs, s_to) ;
445 memcpy(ctx->buf + offs, subs, s_to) ;
446 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
447 len + 1 - s_from - offs) ;
451 s_from = strlen(m->from.c) ;
452 if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) {
454 s_to = strlen(m->to) ;
455 len = strlen(ctx->buf) ;
456 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
457 "H: matched %s, substituting %s", m->from.c, m->to) ) ;
458 if ( s_to > s_from ) {
459 preserve(ctx, s_to - s_from) ;
460 memmove(ctx->buf+s_to, ctx->buf+s_from,
462 memcpy(ctx->buf, m->to, s_to) ;
463 } else { /* it fits in the existing space */
464 memcpy(ctx->buf, m->to, s_to) ;
465 memmove(ctx->buf+s_to, ctx->buf+s_from,
471 if ( num_match > 0 ) /* URIs only want one match */
476 for ( m = ctx->cfg->map ; m ; m = m->next ) {
477 num_match = 0 ; /* reset here since we're working per-rule */
478 if ( ! ( m->flags & M_EVENTS ) )
480 if ( m->flags & M_REGEX ) {
483 while ( ! ap_regexec(m->from.r, ctx->buf+offs,
484 nmatch, pmatch, 0) ) {
485 match = pmatch[0].rm_so ;
486 s_from = pmatch[0].rm_eo - match ;
487 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
490 const char* f = apr_pstrndup(ctx->f->r->pool,
491 ctx->buf + offs , s_from ) ;
492 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
493 "E/RX: match at %s, substituting %s", f, subs) ;
495 s_to = strlen(subs) ;
497 len = strlen(ctx->buf) ;
498 if ( s_to > s_from) {
499 preserve(ctx, s_to - s_from) ;
500 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
501 len + 1 - s_from - offs) ;
502 memcpy(ctx->buf+offs, subs, s_to) ;
504 memcpy(ctx->buf + offs, subs, s_to) ;
505 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
506 len + 1 - s_from - offs) ;
512 found = strstr(ctx->buf, m->from.c) ;
513 if ( (m->flags & M_ATSTART) && ( found != ctx->buf) )
516 s_from = strlen(m->from.c) ;
517 s_to = strlen(m->to) ;
518 match = found - ctx->buf ;
519 if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) {
520 found = strstr(ctx->buf+match+s_from, m->from.c) ;
523 found = strstr(ctx->buf+match+s_to, m->from.c) ;
525 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
526 "E: matched %s, substituting %s", m->from.c, m->to) ) ;
527 len = strlen(ctx->buf) ;
528 if ( s_to > s_from ) {
529 preserve(ctx, s_to - s_from) ;
530 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
531 len + 1 - s_from - match) ;
532 memcpy(ctx->buf+match, m->to, s_to) ;
534 memcpy(ctx->buf+match, m->to, s_to) ;
535 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
536 len + 1 - s_from - match) ;
541 if ( num_match && ( m->flags & M_LAST ) )
550 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ;
553 if ( ctx->cfg->flags != 0 )
554 normalise(ctx->cfg->flags, ctx->buf) ;
556 /* write the attribute, using pcharacters to html-escape
557 anything that needs it in the value.
559 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ;
560 pcharacters(ctx, ctx->buf, strlen(ctx->buf)) ;
561 ap_fputc(ctx->f->next, ctx->bb, '"') ;
566 if ( is_empty_elt(name) )
567 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ;
569 ap_fputc(ctx->f->next, ctx->bb, '>') ;
571 static htmlSAXHandlerPtr setupSAX(apr_pool_t* pool) {
572 htmlSAXHandlerPtr sax = apr_pcalloc(pool, sizeof(htmlSAXHandler) ) ;
573 sax->startDocument = NULL ;
574 sax->endDocument = NULL ;
575 sax->startElement = pstartElement ;
576 sax->endElement = pendElement ;
577 sax->characters = pcharacters ;
578 sax->comment = pcomment ;
579 sax->cdataBlock = pcdata ;
583 static ap_regex_t* seek_meta_ctype ;
584 static ap_regex_t* seek_charset ;
585 static ap_regex_t* seek_meta ;
587 static void proxy_html_child_init(apr_pool_t* pool, server_rec* s) {
588 seek_meta_ctype = ap_pregcomp(pool,
589 "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
590 AP_REG_EXTENDED|AP_REG_ICASE) ;
591 seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)",
592 AP_REG_EXTENDED|AP_REG_ICASE) ;
593 seek_meta = ap_pregcomp(pool, "<meta[^>]*(http-equiv)[^>]*>",
594 AP_REG_EXTENDED|AP_REG_ICASE) ;
597 static xmlCharEncoding sniff_encoding(request_rec* r, const char* cbuf, size_t bytes
602 xmlCharEncoding ret ;
603 char* encoding = NULL ;
606 ap_regmatch_t match[2] ;
607 unsigned char* buf = (unsigned char*)cbuf ;
609 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
610 "Content-Type is %s", r->content_type) ) ;
612 /* If we've got it in the HTTP headers, there's nothing to do */
613 if ( r->content_type &&
614 ( p = ap_strcasestr(r->content_type, "charset=") , p > 0 ) ) {
616 if ( encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ) , encoding ) {
617 if ( ret = xmlParseCharEncoding(encoding),
618 ret != XML_CHAR_ENCODING_ERROR ) {
619 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
620 "Got charset %s from HTTP headers", encoding) ) ;
623 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
624 "Unsupported charset %s in HTTP headers", encoding) ;
630 /* to sniff, first we look for BOM */
631 if ( ret = xmlDetectCharEncoding(buf, bytes),
632 ret != XML_CHAR_ENCODING_NONE ) {
633 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
634 "Got charset from XML rules.") ) ;
638 /* If none of the above, look for a META-thingey */
640 if ( ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0 ) {
641 p = apr_pstrndup(r->pool, buf + match[0].rm_so,
642 match[0].rm_eo - match[0].rm_so) ;
643 if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 )
644 encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
645 match[1].rm_eo - match[1].rm_so) ;
648 /* either it's set to something we found or it's still the default */
650 if ( ret = xmlParseCharEncoding(encoding),
651 ret != XML_CHAR_ENCODING_ERROR ) {
652 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
653 "Got charset %s from HTML META", encoding) ) ;
656 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
657 "Unsupported charset %s in HTML META", encoding) ;
660 /* the old HTTP default is a last resort */
661 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
662 "No usable charset information: using old HTTP default LATIN1") ;
663 return XML_CHAR_ENCODING_8859_1 ;
665 static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/
676 ap_regmatch_t pmatch[2] ;
679 while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) {
682 p = buf+offs+pmatch[1].rm_eo ;
683 while ( !isalpha(*++p) ) ;
684 for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ;
685 header = apr_pstrndup(r->pool, p, q-p) ;
686 if ( strncasecmp(header, "Content-", 8) ) {
687 /* find content=... string */
688 for ( p = strstr(buf+offs+pmatch[0].rm_so, "content") ; *p ; ) {
690 while ( *p && isspace(*p) )
694 while ( *p && isspace(*++p) ) ;
695 if ( ( *p == '\'' ) || ( *p == '"' ) ) {
697 for ( q = p ; *q != delim ; ++q ) ;
699 for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ;
701 content = apr_pstrndup(r->pool, p, q-p) ;
704 } else if ( !strncasecmp(header, "Content-Type", 12) ) {
705 ret = apr_palloc(r->pool, sizeof(meta) ) ;
706 ret->start = pmatch[0].rm_so ;
707 ret->end = pmatch[0].rm_eo ;
709 if ( header && content ) {
710 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
711 "Adding header [%s: %s] from HTML META", header, content) ) ;
712 apr_table_setn(r->headers_out, header, content) ;
714 offs += pmatch[0].rm_eo ;
719 static int proxy_html_filter_init(ap_filter_t* f) {
724 /* remove content-length filter */
725 ap_filter_rec_t* clf = ap_get_output_filter_handle("CONTENT_LENGTH") ;
726 ap_filter_t* ff = f->next ;
729 ap_filter_t* fnext = ff->next ;
730 if ( ff->frec == clf )
731 ap_remove_output_filter(ff) ;
736 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ;
737 fctx->sax = setupSAX(f->r->pool) ;
739 fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ;
740 fctx->cfg = ap_get_module_config(f->r->per_dir_config,&proxy_html_module);
742 if ( f->r->proto_num >= 1001 ) {
743 if ( ! f->r->main && ! f->r->prev ) {
744 env = apr_table_get(f->r->subprocess_env, "force-response-1.0") ;
750 apr_table_unset(f->r->headers_out, "Content-Length") ;
751 apr_table_unset(f->r->headers_out, "ETag") ;
754 static saxctxt* check_filter_init (ap_filter_t* f) {
756 const char* errmsg = NULL ;
757 if ( ! f->r->proxyreq ) {
758 errmsg = "Non-proxy request; not inserting proxy-html filter" ;
759 } else if ( ! f->r->content_type ) {
760 errmsg = "No content-type; bailing out of proxy-html filter" ;
761 } else if ( strncasecmp(f->r->content_type, "text/html", 9) &&
762 strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) {
763 errmsg = "Non-HTML content; not inserting proxy-html filter" ;
769 = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
770 if ( cfg->verbose ) {
771 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, errmsg) ;
774 ap_remove_output_filter(f) ;
778 proxy_html_filter_init(f) ;
781 static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) {
784 xmlCharEncoding enc ;
785 const char* buf = 0 ;
786 apr_size_t bytes = 0 ;
787 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
788 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ;
790 saxctxt* ctxt = check_filter_init(f) ;
792 return ap_pass_brigade(f->next, bb) ;
794 for ( b = APR_BRIGADE_FIRST(bb) ;
795 b != APR_BRIGADE_SENTINEL(bb) ;
796 b = APR_BUCKET_NEXT(b) ) {
797 if ( APR_BUCKET_IS_EOS(b) ) {
798 if ( ctxt->parser != NULL ) {
799 htmlParseChunk(ctxt->parser, buf, 0, 1) ;
801 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
802 apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ;
803 ap_pass_brigade(ctxt->f->next, ctxt->bb) ;
804 } else if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
806 if ( ctxt->parser == NULL ) {
807 if ( buf[bytes] != 0 ) {
808 /* make a string for parse routines to play with */
809 char* buf1 = apr_palloc(f->r->pool, bytes+1) ;
810 memcpy(buf1, buf, bytes) ;
815 enc = sniff_encoding(f->r, buf, bytes, ctxt->cfg->verbose) ;
816 if ( ctxt->cfg->metafix )
817 m = metafix(f->r, buf, ctxt->cfg->verbose) ;
819 enc = sniff_encoding(f->r, buf, bytes) ;
820 if ( ctxt->cfg->metafix )
821 m = metafix(f->r, buf) ;
823 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
824 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype) ;
826 ctxt->parser = htmlCreatePushParserCtxt(ctxt->sax, ctxt,
827 buf, m->start, 0, enc ) ;
828 htmlParseChunk(ctxt->parser, buf+m->end, bytes-m->end, 0) ;
830 ctxt->parser = htmlCreatePushParserCtxt(ctxt->sax, ctxt,
831 buf, bytes, 0, enc ) ;
833 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
834 (void*)htmlFreeParserCtxt, apr_pool_cleanup_null) ;
835 if ( xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts ), xmlopts )
836 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
837 "Unsupported parser opts %x", xmlopts) ;
839 htmlParseChunk(ctxt->parser, buf, bytes, 0) ;
842 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ;
845 /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */
846 apr_brigade_cleanup(bb) ;
849 static const char* fpi_html =
850 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ;
851 static const char* fpi_html_legacy =
852 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ;
853 static const char* fpi_xhtml =
854 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
855 static const char* fpi_xhtml_legacy =
856 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ;
857 static const char* html_etag = ">" ;
858 static const char* xhtml_etag = " />" ;
859 /*#define DEFAULT_DOCTYPE fpi_html */
860 static const char* DEFAULT_DOCTYPE = "" ;
861 #define DEFAULT_ETAG html_etag
863 static void* proxy_html_config(apr_pool_t* pool, char* x) {
864 proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ;
865 ret->doctype = DEFAULT_DOCTYPE ;
866 ret->etag = DEFAULT_ETAG ;
870 static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) {
871 proxy_html_conf* base = (proxy_html_conf*) BASE ;
872 proxy_html_conf* add = (proxy_html_conf*) ADD ;
873 proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ;
875 if ( add->map && base->map ) {
878 for ( a = base->map ; a ; a = a->next ) {
879 urlmap* save = conf->map ;
880 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
881 conf->map->next = save ;
883 for ( a = add->map ; a ; a = a->next ) {
884 urlmap* save = conf->map ;
885 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
886 conf->map->next = save ;
889 conf->map = add->map ? add->map : base->map ;
891 conf->doctype = ( add->doctype == DEFAULT_DOCTYPE )
892 ? base->doctype : add->doctype ;
893 conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ;
894 conf->bufsz = add->bufsz ;
895 if ( add->flags & NORM_RESET ) {
896 conf->flags = add->flags ^ NORM_RESET ;
897 conf->metafix = add->metafix ;
898 conf->extfix = add->extfix ;
899 conf->strip_comments = add->strip_comments ;
901 conf->verbose = add->verbose ;
904 conf->flags = base->flags | add->flags ;
905 conf->metafix = base->metafix | add->metafix ;
906 conf->extfix = base->extfix | add->extfix ;
907 conf->strip_comments = base->strip_comments | add->strip_comments ;
909 conf->verbose = base->verbose | add->verbose ;
/* Flag-string helpers used when parsing the optional flags argument.

   REGFLAG(n,s,c)  yields n when the flag string s is non-NULL and contains
                   the character c; otherwise 0.
   XREGFLAG(n,s,c) is the inverse: yields n when s is NULL or does not
                   contain c; otherwise 0.

   Every parameter is parenthesized in the expansion so that an expression
   argument (e.g. a conditional expression for s) is evaluated as a unit.
   Note that s may be evaluated twice, so it must be free of side effects. */
#define REGFLAG(n,s,c) ( ( ((s) != NULL) && (strchr((s),(c)) != NULL) ) ? (n) : 0 )
#define XREGFLAG(n,s,c) ( ( ((s) == NULL) || (strchr((s),(c)) == NULL) ) ? (n) : 0 )
916 static const char* set_urlmap(cmd_parms* cmd, void* CFG,
917 const char* from, const char* to, const char* flags) {
919 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
921 urlmap* newmap = apr_palloc(cmd->pool, sizeof(urlmap) ) ;
923 newmap->next = NULL ;
925 = XREGFLAG(M_HTML,flags,'h')
926 | XREGFLAG(M_EVENTS,flags,'e')
927 | XREGFLAG(M_CDATA,flags,'c')
928 | REGFLAG(M_ATSTART,flags,'^')
929 | REGFLAG(M_ATEND,flags,'$')
930 | REGFLAG(M_REGEX,flags,'R')
931 | REGFLAG(M_LAST,flags,'L')
935 for ( map = cfg->map ; map->next ; map = map->next ) ;
940 if ( ! (newmap->flags & M_REGEX) ) {
941 newmap->from.c = apr_pstrdup(cmd->pool, from) ;
942 newmap->to = apr_pstrdup(cmd->pool, to) ;
945 = REGFLAG(AP_REG_EXTENDED,flags,'x')
946 | REGFLAG(AP_REG_ICASE,flags,'i')
947 | REGFLAG(AP_REG_NOSUB,flags,'n')
948 | REGFLAG(AP_REG_NEWLINE,flags,'s')
950 newmap->from.r = ap_pregcomp(cmd->pool, from, regflags) ;
951 newmap->to = apr_pstrdup(cmd->pool, to) ;
955 static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t,
957 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
958 if ( !strcasecmp(t, "xhtml") ) {
959 cfg->etag = xhtml_etag ;
960 if ( l && !strcasecmp(l, "legacy") )
961 cfg->doctype = fpi_xhtml_legacy ;
963 cfg->doctype = fpi_xhtml ;
964 } else if ( !strcasecmp(t, "html") ) {
965 cfg->etag = html_etag ;
966 if ( l && !strcasecmp(l, "legacy") )
967 cfg->doctype = fpi_html_legacy ;
969 cfg->doctype = fpi_html ;
971 cfg->doctype = apr_pstrdup(cmd->pool, t) ;
972 if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) )
973 cfg->etag = xhtml_etag ;
975 cfg->etag = html_etag ;
979 static void set_param(proxy_html_conf* cfg, const char* arg) {
981 if ( !strcmp(arg, "lowercase") )
982 cfg->flags |= NORM_LC ;
983 else if ( !strcmp(arg, "dospath") )
984 cfg->flags |= NORM_MSSLASH ;
985 else if ( !strcmp(arg, "reset") )
986 cfg->flags |= NORM_RESET ;
989 static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg1,
990 const char* arg2, const char* arg3) {
991 set_param( (proxy_html_conf*)CFG, arg1) ;
992 set_param( (proxy_html_conf*)CFG, arg2) ;
993 set_param( (proxy_html_conf*)CFG, arg3) ;
996 static const command_rec proxy_html_cmds[] = {
997 AP_INIT_TAKE23("ProxyHTMLURLMap", set_urlmap, NULL,
998 RSRC_CONF|ACCESS_CONF, "Map URL From To" ) ,
999 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1000 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) ,
1001 AP_INIT_TAKE123("ProxyHTMLFixups", set_flags, NULL,
1002 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) ,
1003 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1004 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1005 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) ,
1006 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1007 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1008 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) ,
1009 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1010 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1011 RSRC_CONF|ACCESS_CONF, "Strip out comments" ) ,
1013 AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot,
1014 (void*)APR_OFFSETOF(proxy_html_conf, verbose),
1015 RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) ,
1017 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1018 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1019 RSRC_CONF|ACCESS_CONF, "Buffer size" ) ,
1022 static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2,
1024 ap_add_version_component(p, VERSION_STRING) ;
1027 static void proxy_html_hooks(apr_pool_t* p) {
1028 ap_register_output_filter("proxy-html", proxy_html_filter,
1029 NULL, AP_FTYPE_RESOURCE) ;
1030 ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ;
1031 ap_hook_child_init(proxy_html_child_init, NULL, NULL, APR_HOOK_MIDDLE) ;
1033 module AP_MODULE_DECLARE_DATA proxy_html_module = {
1034 STANDARD20_MODULE_STUFF,