1 /********************************************************************
2 Copyright (c) 2003-5, WebThing Ltd
3 Author: Nick Kew <nick@webthing.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *********************************************************************/
22 /********************************************************************
25 You are requested to register as a user, at
26 http://apache.webthing.com/registration.html
28 This entitles you to support from the developer.
29 I'm unlikely to reply to help/support requests from
30 non-registered users, unless you're paying and/or offering
31 constructive feedback such as bug reports or sensible
32 suggestions for further development.
34 It also makes a small contribution to the effort
35 that's gone into developing this work.
36 *********************************************************************/
43 You can #define GO_FASTER to disable informational logging.
44 This disables the ProxyHTMLLogVerbose option altogether.
46 Default is to leave it undefined, and enable verbose logging
47 as a configuration option. Binaries are supplied with verbose
54 #define VERBOSE(x) if ( verbose ) x
57 #define VERSION_STRING "proxy_html/2.5"
62 #include <libxml/HTMLparser.h>
65 #include <http_protocol.h>
66 #include <http_config.h>
68 #include <apr_strings.h>
70 /* To support Apache 2.1/2.2, we need the ap_ forms of the
71 * regexp stuff, and they're now used in the code.
72 * To support 2.0 in the same compile, we #define the
73 * AP_ versions if necessary.
76 /* it's 2.0, so we #define the ap_ versions */
77 #define ap_regex_t regex_t
78 #define ap_regmatch_t regmatch_t
79 #define AP_REG_EXTENDED REG_EXTENDED
80 #define AP_REG_ICASE REG_ICASE
81 #define AP_REG_NOSUB REG_NOSUB
82 #define AP_REG_NEWLINE REG_NEWLINE
85 module AP_MODULE_DECLARE_DATA proxy_html_module ;
91 #define M_ATSTART 0x10
99 typedef struct urlmap {
100 struct urlmap* next ;
110 const char* doctype ;
122 htmlSAXHandlerPtr sax ;
124 proxy_html_conf* cfg ;
125 htmlParserCtxtPtr parser ;
126 apr_bucket_brigade* bb ;
132 static int is_empty_elt(const char* name) {
134 static const char* empty_elts[] = {
150 for ( p = empty_elts ; *p ; ++p )
151 if ( !strcmp( *p, name) )
/* Bit flags kept in proxy_html_conf.flags and handed to normalise(). */
/* NORM_MSSLASH: set by the "dospath" fixup argument; makes normalise()
 * visit every backslash in the string (presumably to turn it into a
 * forward slash -- the replacement statement is not visible here). */
162 #define NORM_MSSLASH 0x2
/* NORM_RESET: set by the "reset" fixup argument.  When the overriding
 * config carries it, proxy_html_merge() takes that config's flags and
 * boolean options on their own instead of OR-ing them with the base,
 * and clears this bit from the merged flags. */
163 #define NORM_RESET 0x4
/* How pstartElement() classifies the attribute it is currently handling:
 * ATTR_IGNORE is the starting value, ATTR_EVENT is chosen when the
 * attribute name is in the events[] list and extfix is enabled.
 * ATTR_URI presumably marks a name found in the element's link-attribute
 * list (the assignment is not visible here -- confirm). */
165 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ;
167 static void normalise(unsigned int flags, char* str) {
169 if ( flags & NORM_LC )
170 for ( p = str ; *p ; ++p )
174 if ( flags & NORM_MSSLASH )
175 for ( p = strchr(str, '\\') ; p ; p = strchr(p+1, '\\') )
180 #define FLUSH ap_fwrite(ctx->f->next, ctx->bb, (chars+begin), (i-begin)) ; begin = i+1
181 static void pcharacters(void* ctxt, const xmlChar *chars, int length) {
182 saxctxt* ctx = (saxctxt*) ctxt ;
185 for ( begin=i=0; i<length; i++ ) {
187 case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ;
188 case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ;
189 case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ;
190 case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ;
196 static void preserve(saxctxt* ctx, const size_t len) {
198 if ( len <= ( ctx->avail - ctx->offset ) )
200 else while ( len > ( ctx->avail - ctx->offset ) )
201 ctx->avail += ctx->cfg->bufsz ;
203 newbuf = realloc(ctx->buf, ctx->avail) ;
204 if ( newbuf != ctx->buf ) {
206 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (void*)free) ;
207 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
208 (void*)free, apr_pool_cleanup_null);
212 static void pappend(saxctxt* ctx, const char* buf, const size_t len) {
214 memcpy(ctx->buf+ctx->offset, buf, len) ;
217 static void dump_content(saxctxt* ctx) {
220 size_t s_from, s_to ;
224 ap_regmatch_t pmatch[10] ;
228 int verbose = ctx->cfg->verbose ;
231 pappend(ctx, &c, 1) ; /* append null byte */
232 /* parse the text for URLs */
233 for ( m = ctx->cfg->map ; m ; m = m->next ) {
234 if ( ! ( m->flags & M_CDATA ) )
236 if ( m->flags & M_REGEX ) {
239 while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) {
240 match = pmatch[0].rm_so ;
241 s_from = pmatch[0].rm_eo - match ;
242 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
244 s_to = strlen(subs) ;
245 len = strlen(ctx->buf) ;
248 const char* f = apr_pstrndup(ctx->f->r->pool,
249 ctx->buf + offs , s_from ) ;
250 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
251 "C/RX: match at %s, substituting %s", f, subs) ;
253 if ( s_to > s_from) {
254 preserve(ctx, s_to - s_from) ;
255 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
256 len + 1 - s_from - offs) ;
257 memcpy(ctx->buf+offs, subs, s_to) ;
259 memcpy(ctx->buf + offs, subs, s_to) ;
260 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
261 len + 1 - s_from - offs) ;
266 s_from = strlen(m->from.c) ;
267 s_to = strlen(m->to) ;
268 for ( found = strstr(ctx->buf, m->from.c) ; found ;
269 found = strstr(ctx->buf+match+s_to, m->from.c) ) {
270 match = found - ctx->buf ;
271 if ( ( m->flags & M_ATSTART ) && ( match != 0) )
273 len = strlen(ctx->buf) ;
274 if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) )
276 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
277 "C: matched %s, substituting %s", m->from.c, m->to) ) ;
278 if ( s_to > s_from ) {
279 preserve(ctx, s_to - s_from) ;
280 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
281 len + 1 - s_from - match) ;
282 memcpy(ctx->buf+match, m->to, s_to) ;
284 memcpy(ctx->buf+match, m->to, s_to) ;
285 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
286 len + 1 - s_from - match) ;
291 ap_fputs(ctx->f->next, ctx->bb, ctx->buf) ;
293 static void pcdata(void* ctxt, const xmlChar *chars, int length) {
294 saxctxt* ctx = (saxctxt*) ctxt ;
295 if ( ctx->cfg->extfix ) {
296 pappend(ctx, chars, length) ;
298 ap_fwrite(ctx->f->next, ctx->bb, chars, length) ;
301 static void pcomment(void* ctxt, const xmlChar *chars) {
302 saxctxt* ctx = (saxctxt*) ctxt ;
303 if ( ctx->cfg->strip_comments )
306 if ( ctx->cfg->extfix ) {
307 pappend(ctx, "<!--", 4) ;
308 pappend(ctx, chars, strlen(chars) ) ;
309 pappend(ctx, "-->", 3) ;
311 ap_fputstrs(ctx->f->next, ctx->bb, "<!--", chars, "-->", NULL) ;
314 static void pendElement(void* ctxt, const xmlChar* name) {
315 saxctxt* ctx = (saxctxt*) ctxt ;
316 if ( ctx->offset > 0 ) {
318 ctx->offset = 0 ; /* having dumped it, we can re-use the memory */
320 if ( ! is_empty_elt(name) )
321 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ;
323 static void pstartElement(void* ctxt, const xmlChar* name,
324 const xmlChar** attrs ) {
330 const char** linkattrs ;
333 const char** linkattr ;
335 size_t s_to, s_from, match ;
337 saxctxt* ctx = (saxctxt*) ctxt ;
339 ap_regmatch_t pmatch[10] ;
341 int verbose = ctx->cfg->verbose ;
344 static const char* href[] = { "href", NULL } ;
345 static const char* cite[] = { "cite", NULL } ;
346 static const char* action[] = { "action", NULL } ;
347 static const char* imgattr[] = { "src", "longdesc", "usemap", NULL } ;
348 static const char* inputattr[] = { "src", "usemap", NULL } ;
349 static const char* scriptattr[] = { "src", "for", NULL } ;
350 static const char* frameattr[] = { "src", "longdesc", NULL } ;
351 static const char* objattr[] =
352 { "classid", "codebase", "data", "usemap", NULL } ;
353 static const char* profile[] = { "profile", NULL } ;
354 static const char* background[] = { "background", NULL } ;
355 static const char* codebase[] = { "codebase", NULL } ;
357 static const elt_t linked_elts[] = {
359 { "img" , imgattr } ,
362 { "script" , scriptattr } ,
365 { "input" , inputattr } ,
366 { "frame", frameattr } ,
367 { "iframe", frameattr } ,
368 { "object", objattr } ,
370 { "blockquote" , cite } ,
373 { "head" , profile } ,
374 { "body" , background } ,
375 { "applet", codebase } ,
378 static const char* events[] = {
400 ap_fputc(ctx->f->next, ctx->bb, '<') ;
401 ap_fputs(ctx->f->next, ctx->bb, name) ;
405 for ( elt = linked_elts; elt->name != NULL ; ++elt )
406 if ( !strcmp(elt->name, name) ) {
407 linkattrs = elt->attrs ;
410 for ( a = attrs ; *a ; a += 2 ) {
413 pappend(ctx, a[1], strlen(a[1])+1) ;
414 is_uri = ATTR_IGNORE ;
416 for ( linkattr = linkattrs ; *linkattr ; ++linkattr) {
417 if ( !strcmp(*linkattr, *a) ) {
423 if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix ) {
424 for ( linkattr = events; *linkattr; ++linkattr ) {
425 if ( !strcmp(*linkattr, *a) ) {
426 is_uri = ATTR_EVENT ;
434 for ( m = ctx->cfg->map ; m ; m = m->next ) {
435 if ( ! ( m->flags & M_HTML ) )
437 if ( m->flags & M_REGEX ) {
439 if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) {
441 offs = match = pmatch[0].rm_so ;
442 s_from = pmatch[0].rm_eo - match ;
443 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
446 const char* f = apr_pstrndup(ctx->f->r->pool,
447 ctx->buf + offs , s_from ) ;
448 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
449 "H/RX: match at %s, substituting %s", f, subs) ;
451 s_to = strlen(subs) ;
452 len = strlen(ctx->buf) ;
453 if ( s_to > s_from) {
454 preserve(ctx, s_to - s_from) ;
455 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
456 len + 1 - s_from - offs) ;
457 memcpy(ctx->buf+offs, subs, s_to) ;
459 memcpy(ctx->buf + offs, subs, s_to) ;
460 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
461 len + 1 - s_from - offs) ;
465 s_from = strlen(m->from.c) ;
466 if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) {
468 s_to = strlen(m->to) ;
469 len = strlen(ctx->buf) ;
470 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
471 "H: matched %s, substituting %s", m->from.c, m->to) ) ;
472 if ( s_to > s_from ) {
473 preserve(ctx, s_to - s_from) ;
474 memmove(ctx->buf+s_to, ctx->buf+s_from,
476 memcpy(ctx->buf, m->to, s_to) ;
477 } else { /* it fits in the existing space */
478 memcpy(ctx->buf, m->to, s_to) ;
479 memmove(ctx->buf+s_to, ctx->buf+s_from,
485 if ( num_match > 0 ) /* URIs only want one match */
490 for ( m = ctx->cfg->map ; m ; m = m->next ) {
491 num_match = 0 ; /* reset here since we're working per-rule */
492 if ( ! ( m->flags & M_EVENTS ) )
494 if ( m->flags & M_REGEX ) {
497 while ( ! ap_regexec(m->from.r, ctx->buf+offs,
498 nmatch, pmatch, 0) ) {
499 match = pmatch[0].rm_so ;
500 s_from = pmatch[0].rm_eo - match ;
501 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
504 const char* f = apr_pstrndup(ctx->f->r->pool,
505 ctx->buf + offs , s_from ) ;
506 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
507 "E/RX: match at %s, substituting %s", f, subs) ;
509 s_to = strlen(subs) ;
511 len = strlen(ctx->buf) ;
512 if ( s_to > s_from) {
513 preserve(ctx, s_to - s_from) ;
514 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
515 len + 1 - s_from - offs) ;
516 memcpy(ctx->buf+offs, subs, s_to) ;
518 memcpy(ctx->buf + offs, subs, s_to) ;
519 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
520 len + 1 - s_from - offs) ;
526 found = strstr(ctx->buf, m->from.c) ;
527 if ( (m->flags & M_ATSTART) && ( found != ctx->buf) )
530 s_from = strlen(m->from.c) ;
531 s_to = strlen(m->to) ;
532 match = found - ctx->buf ;
533 if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) {
534 found = strstr(ctx->buf+match+s_from, m->from.c) ;
537 found = strstr(ctx->buf+match+s_to, m->from.c) ;
539 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
540 "E: matched %s, substituting %s", m->from.c, m->to) ) ;
541 len = strlen(ctx->buf) ;
542 if ( s_to > s_from ) {
543 preserve(ctx, s_to - s_from) ;
544 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
545 len + 1 - s_from - match) ;
546 memcpy(ctx->buf+match, m->to, s_to) ;
548 memcpy(ctx->buf+match, m->to, s_to) ;
549 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
550 len + 1 - s_from - match) ;
555 if ( num_match && ( m->flags & M_LAST ) )
564 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ;
567 if ( ctx->cfg->flags != 0 )
568 normalise(ctx->cfg->flags, ctx->buf) ;
570 /* write the attribute, using pcharacters to html-escape
571 anything that needs it in the value.
573 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ;
574 pcharacters(ctx, ctx->buf, strlen(ctx->buf)) ;
575 ap_fputc(ctx->f->next, ctx->bb, '"') ;
580 if ( is_empty_elt(name) )
581 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ;
583 ap_fputc(ctx->f->next, ctx->bb, '>') ;
585 static htmlSAXHandlerPtr setupSAX(apr_pool_t* pool) {
586 htmlSAXHandlerPtr sax = apr_pcalloc(pool, sizeof(htmlSAXHandler) ) ;
587 sax->startDocument = NULL ;
588 sax->endDocument = NULL ;
589 sax->startElement = pstartElement ;
590 sax->endElement = pendElement ;
591 sax->characters = pcharacters ;
592 sax->comment = pcomment ;
593 sax->cdataBlock = pcdata ;
/* Patterns for scanning <meta> elements in the first chunk of a response.
 * All three are compiled in proxy_html_child_init() with
 * AP_REG_EXTENDED|AP_REG_ICASE. */
/* A whole <meta ...> tag whose http-equiv is content-type; sniff_encoding()
 * uses it to isolate the tag before looking for a charset. */
597 static ap_regex_t* seek_meta_ctype ;
/* "charset=" followed by a token of letters, digits, '_' or '-'; the token
 * is capture group 1, which sniff_encoding() copies out as the encoding. */
598 static ap_regex_t* seek_charset ;
/* Any <meta ...> tag containing http-equiv (the attribute name itself is
 * capture group 1); metafix() applies it repeatedly through the buffer. */
599 static ap_regex_t* seek_meta ;
601 static void proxy_html_child_init(apr_pool_t* pool, server_rec* s) {
602 seek_meta_ctype = ap_pregcomp(pool,
603 "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
604 AP_REG_EXTENDED|AP_REG_ICASE) ;
605 seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)",
606 AP_REG_EXTENDED|AP_REG_ICASE) ;
607 seek_meta = ap_pregcomp(pool, "<meta[^>]*(http-equiv)[^>]*>",
608 AP_REG_EXTENDED|AP_REG_ICASE) ;
611 static xmlCharEncoding sniff_encoding(
612 request_rec* r, const char* cbuf, size_t bytes
617 xmlCharEncoding ret ;
618 char* encoding = NULL ;
620 ap_regmatch_t match[2] ;
621 unsigned char* buf = (unsigned char*)cbuf ;
623 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
624 "Content-Type is %s", r->content_type) ) ;
626 /* If we've got it in the HTTP headers, there's nothing to do */
627 if ( r->content_type &&
628 ( p = ap_strcasestr(r->content_type, "charset=") , p > 0 ) ) {
630 if ( encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ) , encoding ) {
631 if ( ret = xmlParseCharEncoding(encoding),
632 ret != XML_CHAR_ENCODING_ERROR ) {
633 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
634 "Got charset %s from HTTP headers", encoding) ) ;
637 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
638 "Unsupported charset %s in HTTP headers", encoding) ;
644 /* to sniff, first we look for BOM */
645 if ( ret = xmlDetectCharEncoding(buf, bytes),
646 ret != XML_CHAR_ENCODING_NONE ) {
647 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
648 "Got charset from XML rules.") ) ;
652 /* If none of the above, look for a META-thingey */
654 if ( ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0 ) {
655 p = apr_pstrndup(r->pool, buf + match[0].rm_so,
656 match[0].rm_eo - match[0].rm_so) ;
657 if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 )
658 encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
659 match[1].rm_eo - match[1].rm_so) ;
662 /* either it's set to something we found or it's still the default */
664 if ( ret = xmlParseCharEncoding(encoding),
665 ret != XML_CHAR_ENCODING_ERROR ) {
666 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
667 "Got charset %s from HTML META", encoding) ) ;
670 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
671 "Unsupported charset %s in HTML META", encoding) ;
674 /* the old HTTP default is a last resort */
675 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
676 "No usable charset information: using old HTTP default LATIN1") ;
677 return XML_CHAR_ENCODING_8859_1 ;
679 static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/
690 ap_regmatch_t pmatch[2] ;
693 while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) {
696 p = buf+offs+pmatch[1].rm_eo ;
697 while ( !isalpha(*++p) ) ;
698 for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ;
699 header = apr_pstrndup(r->pool, p, q-p) ;
700 if ( strncasecmp(header, "Content-", 8) ) {
701 /* find content=... string */
702 for ( p = ap_strstr((char*)buf+offs+pmatch[0].rm_so, "content") ; *p ; ) {
704 while ( *p && isspace(*p) )
708 while ( *p && isspace(*++p) ) ;
709 if ( ( *p == '\'' ) || ( *p == '"' ) ) {
711 for ( q = p ; *q != delim ; ++q ) ;
713 for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ;
715 content = apr_pstrndup(r->pool, p, q-p) ;
718 } else if ( !strncasecmp(header, "Content-Type", 12) ) {
719 ret = apr_palloc(r->pool, sizeof(meta) ) ;
720 ret->start = pmatch[0].rm_so ;
721 ret->end = pmatch[0].rm_eo ;
723 if ( header && content ) {
724 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
725 "Adding header [%s: %s] from HTML META", header, content) ) ;
726 apr_table_setn(r->headers_out, header, content) ;
728 offs += pmatch[0].rm_eo ;
733 static int proxy_html_filter_init(ap_filter_t* f) {
738 /* remove content-length filter */
739 ap_filter_rec_t* clf = ap_get_output_filter_handle("CONTENT_LENGTH") ;
740 ap_filter_t* ff = f->next ;
743 ap_filter_t* fnext = ff->next ;
744 if ( ff->frec == clf )
745 ap_remove_output_filter(ff) ;
750 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ;
751 fctx->sax = setupSAX(f->r->pool) ;
753 fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ;
754 fctx->cfg = ap_get_module_config(f->r->per_dir_config,&proxy_html_module);
756 if ( f->r->proto_num >= 1001 ) {
757 if ( ! f->r->main && ! f->r->prev ) {
758 env = apr_table_get(f->r->subprocess_env, "force-response-1.0") ;
764 apr_table_unset(f->r->headers_out, "Content-Length") ;
765 apr_table_unset(f->r->headers_out, "ETag") ;
768 static saxctxt* check_filter_init (ap_filter_t* f) {
770 const char* errmsg = NULL ;
771 if ( ! f->r->proxyreq ) {
772 errmsg = "Non-proxy request; not inserting proxy-html filter" ;
773 } else if ( ! f->r->content_type ) {
774 errmsg = "No content-type; bailing out of proxy-html filter" ;
775 } else if ( strncasecmp(f->r->content_type, "text/html", 9) &&
776 strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) {
777 errmsg = "Non-HTML content; not inserting proxy-html filter" ;
783 = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
784 if ( cfg->verbose ) {
785 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, errmsg) ;
788 ap_remove_output_filter(f) ;
792 proxy_html_filter_init(f) ;
795 static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) {
798 xmlCharEncoding enc ;
799 const char* buf = 0 ;
800 apr_size_t bytes = 0 ;
801 #ifndef USE_OLD_LIBXML2
802 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
803 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ;
806 saxctxt* ctxt = check_filter_init(f) ;
808 return ap_pass_brigade(f->next, bb) ;
810 for ( b = APR_BRIGADE_FIRST(bb) ;
811 b != APR_BRIGADE_SENTINEL(bb) ;
812 b = APR_BUCKET_NEXT(b) ) {
813 if ( APR_BUCKET_IS_EOS(b) ) {
814 if ( ctxt->parser != NULL ) {
815 htmlParseChunk(ctxt->parser, buf, 0, 1) ;
817 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
818 apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ;
819 ap_pass_brigade(ctxt->f->next, ctxt->bb) ;
820 } else if ( ! APR_BUCKET_IS_METADATA(b) &&
821 apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
823 if ( ctxt->parser == NULL ) {
824 if ( buf && buf[bytes] != 0 ) {
825 /* make a string for parse routines to play with */
826 char* buf1 = apr_palloc(f->r->pool, bytes+1) ;
827 memcpy(buf1, buf, bytes) ;
832 enc = sniff_encoding(f->r, buf, bytes, ctxt->cfg->verbose) ;
833 if ( ctxt->cfg->metafix )
834 m = metafix(f->r, buf, ctxt->cfg->verbose) ;
836 enc = sniff_encoding(f->r, buf, bytes) ;
837 if ( ctxt->cfg->metafix )
838 m = metafix(f->r, buf) ;
840 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
841 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype) ;
843 ctxt->parser = htmlCreatePushParserCtxt(ctxt->sax, ctxt,
844 buf, m->start, 0, enc ) ;
845 htmlParseChunk(ctxt->parser, buf+m->end, bytes-m->end, 0) ;
847 ctxt->parser = htmlCreatePushParserCtxt(ctxt->sax, ctxt,
848 buf, bytes, 0, enc ) ;
850 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
851 (void*)htmlFreeParserCtxt, apr_pool_cleanup_null) ;
852 #ifndef USE_OLD_LIBXML2
853 if ( xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts ), xmlopts )
854 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
855 "Unsupported parser opts %x", xmlopts) ;
858 htmlParseChunk(ctxt->parser, buf, bytes, 0) ;
861 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ;
864 /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */
865 apr_brigade_cleanup(bb) ;
/* Doctype declarations selectable with ProxyHTMLDoctype (see set_doctype()).
 * The configured string is written once to the output, when the parser
 * for a response is first set up in proxy_html_filter(). */
/* HTML 4.01 (selected by "html") */
868 static const char* fpi_html =
869 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ;
/* HTML 4.01 Transitional (selected by "html legacy") */
870 static const char* fpi_html_legacy =
871 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ;
/* XHTML 1.0 Strict (selected by "xhtml") */
872 static const char* fpi_xhtml =
873 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
/* XHTML 1.0 Transitional (selected by "xhtml legacy") */
874 static const char* fpi_xhtml_legacy =
875 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ;
/* Text that pstartElement() writes to close the start tag of an element
 * for which is_empty_elt() is true: plain ">" for HTML, " />" for XHTML. */
876 static const char* html_etag = ">" ;
877 static const char* xhtml_etag = " />" ;
878 /*#define DEFAULT_DOCTYPE fpi_html */
/* Defaults installed by proxy_html_config(): no doctype line, HTML-style
 * empty tags.  proxy_html_merge() compares the configured pointers against
 * these two (pointer equality, not strcmp) to decide whether a directory
 * left the setting at its default and should inherit the base value. */
879 static const char* DEFAULT_DOCTYPE = "" ;
880 #define DEFAULT_ETAG html_etag
882 static void* proxy_html_config(apr_pool_t* pool, char* x) {
883 proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ;
884 ret->doctype = DEFAULT_DOCTYPE ;
885 ret->etag = DEFAULT_ETAG ;
889 static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) {
890 proxy_html_conf* base = (proxy_html_conf*) BASE ;
891 proxy_html_conf* add = (proxy_html_conf*) ADD ;
892 proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ;
894 if ( add->map && base->map ) {
897 for ( a = base->map ; a ; a = a->next ) {
898 urlmap* save = conf->map ;
899 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
900 conf->map->next = save ;
902 for ( a = add->map ; a ; a = a->next ) {
903 urlmap* save = conf->map ;
904 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
905 conf->map->next = save ;
908 conf->map = add->map ? add->map : base->map ;
910 conf->doctype = ( add->doctype == DEFAULT_DOCTYPE )
911 ? base->doctype : add->doctype ;
912 conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ;
913 conf->bufsz = add->bufsz ;
914 if ( add->flags & NORM_RESET ) {
915 conf->flags = add->flags ^ NORM_RESET ;
916 conf->metafix = add->metafix ;
917 conf->extfix = add->extfix ;
918 conf->strip_comments = add->strip_comments ;
920 conf->verbose = add->verbose ;
923 conf->flags = base->flags | add->flags ;
924 conf->metafix = base->metafix | add->metafix ;
925 conf->extfix = base->extfix | add->extfix ;
926 conf->strip_comments = base->strip_comments | add->strip_comments ;
928 conf->verbose = base->verbose | add->verbose ;
/* REGFLAG(n,s,c): evaluates to n when the flag string s is non-NULL and
 * contains the character c, otherwise 0.  Used for opt-in flags. */
933 #define REGFLAG(n,s,c) ( (s&&(ap_strchr((char*)(s),(c))!=NULL)) ? (n) : 0 )
/* XREGFLAG(n,s,c): the inverse test -- evaluates to n when s is NULL or
 * does not contain c, so the flag is on by default and naming c in the
 * flag string switches it off.
 * NOTE: s is used unparenthesised in `!s` (and in `s&&` above), so pass
 * a plain identifier rather than a compound expression. */
934 #define XREGFLAG(n,s,c) ( (!s||(ap_strchr((char*)(s),(c))==NULL)) ? (n) : 0 )
935 static const char* set_urlmap(cmd_parms* cmd, void* CFG,
936 const char* from, const char* to, const char* flags) {
938 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
940 urlmap* newmap = apr_palloc(cmd->pool, sizeof(urlmap) ) ;
942 newmap->next = NULL ;
944 = XREGFLAG(M_HTML,flags,'h')
945 | XREGFLAG(M_EVENTS,flags,'e')
946 | XREGFLAG(M_CDATA,flags,'c')
947 | REGFLAG(M_ATSTART,flags,'^')
948 | REGFLAG(M_ATEND,flags,'$')
949 | REGFLAG(M_REGEX,flags,'R')
950 | REGFLAG(M_LAST,flags,'L')
954 for ( map = cfg->map ; map->next ; map = map->next ) ;
959 if ( ! (newmap->flags & M_REGEX) ) {
960 newmap->from.c = apr_pstrdup(cmd->pool, from) ;
961 newmap->to = apr_pstrdup(cmd->pool, to) ;
964 = REGFLAG(AP_REG_EXTENDED,flags,'x')
965 | REGFLAG(AP_REG_ICASE,flags,'i')
966 | REGFLAG(AP_REG_NOSUB,flags,'n')
967 | REGFLAG(AP_REG_NEWLINE,flags,'s')
969 newmap->from.r = ap_pregcomp(cmd->pool, from, regflags) ;
970 newmap->to = apr_pstrdup(cmd->pool, to) ;
974 static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t,
976 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
977 if ( !strcasecmp(t, "xhtml") ) {
978 cfg->etag = xhtml_etag ;
979 if ( l && !strcasecmp(l, "legacy") )
980 cfg->doctype = fpi_xhtml_legacy ;
982 cfg->doctype = fpi_xhtml ;
983 } else if ( !strcasecmp(t, "html") ) {
984 cfg->etag = html_etag ;
985 if ( l && !strcasecmp(l, "legacy") )
986 cfg->doctype = fpi_html_legacy ;
988 cfg->doctype = fpi_html ;
990 cfg->doctype = apr_pstrdup(cmd->pool, t) ;
991 if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) )
992 cfg->etag = xhtml_etag ;
994 cfg->etag = html_etag ;
998 static void set_param(proxy_html_conf* cfg, const char* arg) {
1000 if ( !strcmp(arg, "lowercase") )
1001 cfg->flags |= NORM_LC ;
1002 else if ( !strcmp(arg, "dospath") )
1003 cfg->flags |= NORM_MSSLASH ;
1004 else if ( !strcmp(arg, "reset") )
1005 cfg->flags |= NORM_RESET ;
1008 static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg1,
1009 const char* arg2, const char* arg3) {
1010 set_param( (proxy_html_conf*)CFG, arg1) ;
1011 set_param( (proxy_html_conf*)CFG, arg2) ;
1012 set_param( (proxy_html_conf*)CFG, arg3) ;
1015 static const command_rec proxy_html_cmds[] = {
1016 AP_INIT_TAKE23("ProxyHTMLURLMap", set_urlmap, NULL,
1017 RSRC_CONF|ACCESS_CONF, "Map URL From To" ) ,
1018 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1019 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) ,
1020 AP_INIT_TAKE123("ProxyHTMLFixups", set_flags, NULL,
1021 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) ,
1022 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1023 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1024 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) ,
1025 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1026 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1027 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) ,
1028 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1029 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1030 RSRC_CONF|ACCESS_CONF, "Strip out comments" ) ,
1032 AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot,
1033 (void*)APR_OFFSETOF(proxy_html_conf, verbose),
1034 RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) ,
1036 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1037 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1038 RSRC_CONF|ACCESS_CONF, "Buffer size" ) ,
1041 static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2,
1043 ap_add_version_component(p, VERSION_STRING) ;
1046 static void proxy_html_hooks(apr_pool_t* p) {
1047 ap_register_output_filter("proxy-html", proxy_html_filter,
1048 NULL, AP_FTYPE_RESOURCE) ;
1049 ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ;
1050 ap_hook_child_init(proxy_html_child_init, NULL, NULL, APR_HOOK_MIDDLE) ;
1052 module AP_MODULE_DECLARE_DATA proxy_html_module = {
1053 STANDARD20_MODULE_STUFF,