1 /********************************************************************
2 Copyright (c) 2003-9, WebThing Ltd
3 Author: Nick Kew <nick@webthing.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License Version 2,
7 as published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You can obtain a copy of the GNU General Public License Version 2
15 from http://www.gnu.org/licenses/old-licenses/gpl-2.0.html or
16 http://apache.webthing.com/COPYING.txt
18 *********************************************************************/
20 /**** NOTICE TO PACKAGERS
22 * This module now relies on mod_xml2enc for i18n support.
23 * You should make mod_xml2enc a dependency in your packages.
33 You can #define GO_FASTER to disable informational logging.
34 This disables the ProxyHTMLLogVerbose option altogether.
36 Default is to leave it undefined, and enable verbose logging
37 as a configuration option. Binaries are supplied with verbose
// VERBOSE(x) / VERBOSEB(x): execute x only when a variable named "verbose",
// which must be in scope at the point of use, is non-zero.
// NOTE(review): both expand to a bare "if", so a use placed directly before
// an "else" would capture that "else"; a do { } while (0) wrapper avoids it.
45 #define VERBOSE(x) if (verbose) x
46 #define VERBOSEB(x) if (verbose) {x}
49 /* 3.1.2 - trivial changes to fix compile on Windows */
50 #define VERSION_STRING "proxy_html/3.1.2"
55 #include <libxml/HTMLparser.h>
58 #include <http_protocol.h>
59 #include <http_config.h>
61 #include <apr_strings.h>
63 #include <apr_strmatch.h>
65 #include <apr_optional.h>
66 #include <mod_xml2enc.h>
67 #include <http_request.h>
69 /* To support Apache 2.1/2.2, we need the ap_ forms of the
70 * regexp stuff, and they're now used in the code.
71 * To support 2.0 in the same compile, we #define the
72 * AP_ versions if necessary.
75 /* it's 2.0, so we #define the ap_ versions */
/* Apache 2.0 fallbacks: spell the ap_ and AP_ regex names in terms of the
 * plain regex_t / regmatch_t / REG_ names. */
76 #define ap_regex_t regex_t
77 #define ap_regmatch_t regmatch_t
78 #define AP_REG_EXTENDED REG_EXTENDED
79 #define AP_REG_ICASE REG_ICASE
80 #define AP_REG_NOSUB REG_NOSUB
81 #define AP_REG_NEWLINE REG_NEWLINE
/* Map the five-argument registration call onto ap_register_output_filter;
 * the fifth argument (e) is discarded. */
83 #define ap_register_output_filter_protocol(a,b,c,d,e) ap_register_output_filter(a,b,c,d)
88 /* globals set once at startup */
/* Regex matching <meta ... http-equiv ...> elements; compiled in
 * mod_proxy_html() and used by metafix(). */
89 static ap_regex_t* seek_meta ;
/* Precompiled search pattern for the word "content"; built in
 * mod_proxy_html() and used by metafix(). */
90 static const apr_strmatch_pattern* seek_content ;
/* Optional functions fetched with APR_RETRIEVE_OPTIONAL_FN in
 * mod_proxy_html(). They start as NULL; mod_proxy_html() and
 * proxy_html_filter() both test xml2enc_charset for NULL and treat that
 * case as "no i18n support". */
91 static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL;
92 static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL;
/* Forward declaration of this module's record, needed by the
 * ap_get_module_config() calls below. */
94 module AP_MODULE_DECLARE_DATA proxy_html_module ;
/* urlmap flag bits (only some of the M_ definitions are visible here).
 * comp_urlmap() sets them from the ProxyHTMLURLMap flag letters:
 *   M_ATSTART '^', M_NOTLAST 'l', M_INTERPOLATE_TO 'V',
 *   M_INTERPOLATE_FROM 'v'. */
100 #define M_ATSTART 0x10
103 #define M_NOTLAST 0x80
104 #define M_INTERPOLATE_TO 0x100
105 #define M_INTERPOLATE_FROM 0x200
/* One URL-rewriting rule. Rules form a singly linked list through next.
 * Most members are not visible in this excerpt. */
119 typedef struct urlmap {
120 struct urlmap* next ;
/* Flags handed to ap_pregcomp() when the from-pattern is compiled
 * (see comp_urlmap() and fixup_rules()). */
122 unsigned int regflags ;
/* NOTE(review): the lines below appear to be members of the per-directory
 * configuration (used as cfg->doctype, cfg->events, cfg->charset_out);
 * the enclosing struct header is not visible - confirm. */
132 const char* doctype ;
137 apr_array_header_t* events;
138 const char* charset_out;
/* NOTE(review): the lines below appear to be members of the per-request
 * filter context (used as ctx->cfg, ctx->parser, ctx->bb); the enclosing
 * struct header is not visible - confirm. */
150 proxy_html_conf* cfg ;
151 htmlParserCtxtPtr parser ;
152 apr_bucket_brigade* bb ;
156 const char* encoding;
/* Normalisation flag bits stored in cfg->flags (NORM_LC is defined on a
 * line not visible here). set_flags() sets NORM_MSSLASH for "dospath" and
 * NORM_RESET for "reset"; proxy_html_merge() strips NORM_RESET again. */
162 #define NORM_MSSLASH 0x2
163 #define NORM_RESET 0x4
/* SAX callback table handed to htmlCreatePushParserCtxt(); zeroed and
 * filled in by mod_proxy_html(). */
164 static htmlSAXHandler sax ;
/* Classification of an attribute value inside pstartElement(). */
166 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ;
/* Doctype declarations selectable through ProxyHTMLDoctype (set_doctype()).
 * The chosen string is written verbatim at the start of the output by
 * proxy_html_filter(). Code elsewhere compares cfg->doctype against these
 * objects by pointer, not by content. */
168 static const char* const fpi_html =
169 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ;
170 static const char* const fpi_html_legacy =
171 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ;
172 static const char* const fpi_xhtml =
173 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
174 static const char* const fpi_xhtml_legacy =
175 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ;
/* Text that closes the start tag of an empty element (cfg->etag), written
 * by pstartElement() when desc->empty is set. */
176 static const char* const html_etag = ">" ;
177 static const char* const xhtml_etag = " />" ;
178 /*#define DEFAULT_DOCTYPE fpi_html */
/* Defaults installed by proxy_html_config(): an empty doctype string and
 * the HTML-style empty-tag ending. proxy_html_merge() recognises "unset"
 * by comparing pointers against these two names. */
179 static const char* const DEFAULT_DOCTYPE = "" ;
180 #define DEFAULT_ETAG html_etag
/* Normalise the string str in place according to the NORM_ bits in flags.
 *
 * flags: NORM_LC requests a pass over every character of str;
 *        NORM_MSSLASH requests a pass over each backslash located with
 *        ap_strchr().
 * str:   NUL-terminated, writable buffer.
 *
 * The statement applied at each position is not visible in this excerpt;
 * presumably lower-casing and backslash-to-slash conversion, going by the
 * "lowercase" and "dospath" option names in set_flags() - confirm.
 */
182 static void normalise(unsigned int flags, char* str) {
184 if ( flags & NORM_LC )
185 for ( p = str ; *p ; ++p )
189 if ( flags & NORM_MSSLASH )
190 for ( p = ap_strchr(str, '\\') ; p ; p = ap_strchr(p+1, '\\') )
/* Feed "bytes" bytes starting at inbuf to the libxml2 push parser held in
 * ctx->parser; flag is passed through as the last htmlParseChunk argument. */
194 #define consume_buffer(ctx,inbuf,bytes,flag) \
195 htmlParseChunk(ctx->parser, inbuf, bytes, flag)
/* Write "bytes" bytes of inbuf to the next filter through ctx->bb.
 * NOTE(review): the flush argument is accepted but never used, and the
 * expansion already ends in ';', so call sites that add their own ';'
 * produce an extra empty statement (a hazard before an "else"). */
197 #define AP_fwrite(ctx,inbuf,bytes,flush) \
198 ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);
200 /* This is always utf-8 on entry. We can convert charset within FLUSH */
/* FLUSH writes the pending run chars[begin .. i) and restarts the run just
 * past position i. It expands to more than one statement and relies on
 * locals named ctx, chars, begin and i. */
201 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0) ; begin = i+1
/* SAX "characters" callback: copies text to the output brigade, handling
 * the characters '&', '<', '>' and '"' specially.
 *
 * ctxt:   the saxctxt for this request (passed as void*).
 * uchars: text to emit; "length" bytes are examined.
 *
 * For each special character the pending run is flushed and a replacement
 * string is written with ap_fputs(). pstartElement() also calls this
 * function to escape attribute values.
 *
 * NOTE(review): as shown, the replacement for '&', '<' and '>' is the same
 * raw character, and the replacement for '"' is not a valid string
 * literal. This looks like entity references lost when the listing was
 * captured - confirm against the upstream source.
 */
202 static void pcharacters(void* ctxt, const xmlChar *uchars, int length) {
203 const char* chars = (const char*) uchars;
204 saxctxt* ctx = (saxctxt*) ctxt ;
207 for ( begin=i=0; i<length; i++ ) {
209 case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ;
210 case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ;
211 case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ;
212 case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ;
/* Make sure ctx->buf has room for len more bytes beyond ctx->offset.
 *
 * ctx: filter context owning buf / avail / offset.
 * len: number of additional bytes required.
 *
 * When the free space (avail - offset) is too small, avail is raised in
 * steps of cfg->bufsz until it fits and the buffer is reallocated. If
 * realloc() returns a different address, the pool cleanup that frees the
 * buffer is moved from the old pointer to the new one.
 *
 * NOTE(review): no check of the realloc() result is visible in this
 * excerpt, and the growth loop cannot finish if cfg->bufsz is 0 - confirm
 * both are handled on lines not shown.
 */
218 static void preserve(saxctxt* ctx, const size_t len) {
/* enough room already */
220 if ( len <= ( ctx->avail - ctx->offset ) )
222 else while ( len > ( ctx->avail - ctx->offset ) )
223 ctx->avail += ctx->cfg->bufsz ;
225 newbuf = realloc(ctx->buf, ctx->avail) ;
226 if ( newbuf != ctx->buf ) {
/* the block moved: re-point the request pool's free() cleanup */
228 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (int(*)(void*))free);
229 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
230 (int(*)(void*))free, apr_pool_cleanup_null);
/* Copy len bytes from buf into ctx->buf at position ctx->offset.
 * Presumably the lines not visible here call preserve() first and advance
 * ctx->offset afterwards - confirm. */
234 static void pappend(saxctxt* ctx, const char* buf, const size_t len) {
236 memcpy(ctx->buf+ctx->offset, buf, len) ;
/* Rewrite the text buffered in ctx->buf with the rules in ctx->map and
 * write the result to the output brigade.
 *
 * The buffer is first NUL-terminated by appending one byte, so the rest of
 * the function can use C string functions on it. Only rules carrying
 * M_CDATA are considered. Regex rules (M_REGEX) are applied repeatedly with
 * ap_regexec()/ap_pregsub(); other rules are fixed strings located with
 * strstr() and may be restricted by M_ATSTART / M_ATEND. Whenever the
 * replacement is longer than the match, preserve() grows the buffer before
 * the tail is shifted.
 */
239 static void dump_content(saxctxt* ctx) {
242 size_t s_from, s_to ;
246 ap_regmatch_t pmatch[10] ;
249 urlmap* themap = ctx->map;
251 int verbose = ctx->cfg->verbose ;
254 pappend(ctx, &c, 1) ; /* append null byte */
255 /* parse the text for URLs */
256 for ( m = themap ; m ; m = m->next ) {
257 if ( ! ( m->flags & M_CDATA ) )
259 if ( m->flags & M_REGEX ) {
/* regex rule: search from offset offs; pmatch offsets are relative to
 * ctx->buf+offs */
262 while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) {
263 match = pmatch[0].rm_so ;
264 s_from = pmatch[0].rm_eo - match ;
265 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
267 s_to = strlen(subs) ;
268 len = strlen(ctx->buf) ;
271 const char* f = apr_pstrndup(ctx->f->r->pool,
272 ctx->buf + offs , s_from ) ;
273 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
274 "C/RX: match at %s, substituting %s", f, subs) ;
/* replacement longer than match: grow, shift the tail (including the
 * NUL) to the right, then copy the replacement into the gap */
276 if ( s_to > s_from) {
277 preserve(ctx, s_to - s_from) ;
278 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
279 len + 1 - s_from - offs) ;
280 memcpy(ctx->buf+offs, subs, s_to) ;
/* replacement fits: copy it in, then close the gap */
282 memcpy(ctx->buf + offs, subs, s_to) ;
283 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
284 len + 1 - s_from - offs) ;
/* fixed-string rule: the next search resumes just past the text that was
 * substituted in */
289 s_from = strlen(m->from.c) ;
290 s_to = strlen(m->to) ;
291 for ( found = strstr(ctx->buf, m->from.c) ; found ;
292 found = strstr(ctx->buf+match+s_to, m->from.c) ) {
/* work with an offset: preserve() below may move ctx->buf */
293 match = found - ctx->buf ;
294 if ( ( m->flags & M_ATSTART ) && ( match != 0) )
296 len = strlen(ctx->buf) ;
297 if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) )
299 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
300 "C: matched %s, substituting %s", m->from.c, m->to) ) ;
301 if ( s_to > s_from ) {
302 preserve(ctx, s_to - s_from) ;
303 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
304 len + 1 - s_from - match) ;
305 memcpy(ctx->buf+match, m->to, s_to) ;
307 memcpy(ctx->buf+match, m->to, s_to) ;
308 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
309 len + 1 - s_from - match) ;
/* emit the rewritten text, without the terminating NUL */
314 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1) ;
/* SAX cdataBlock callback.
 *
 * ctxt:   the saxctxt for this request (passed as void*).
 * uchars: block contents; "length" bytes are used.
 *
 * With cfg->extfix set, the text is buffered with pappend(); dump_content()
 * rewrites and emits the buffer when pendElement() finds it non-empty.
 * The other visible path writes the text straight to the output.
 */
316 static void pcdata(void* ctxt, const xmlChar *uchars, int length) {
317 const char* chars = (const char*) uchars;
318 saxctxt* ctx = (saxctxt*) ctxt ;
319 if ( ctx->cfg->extfix ) {
320 pappend(ctx, chars, length) ;
322 /* not sure if this should force-flush
323 * (i.e. can one cdata section come in multiple calls?)
325 AP_fwrite(ctx, chars, length, 0) ;
// SAX comment callback.
//
// ctxt:   the saxctxt for this request (passed as void*).
// uchars: comment text as a NUL-terminated string, without delimiters.
//
// cfg->strip_comments is tested first; the action taken is not visible in
// this excerpt (presumably an early return - confirm). Otherwise the
// comment is re-emitted between "<!--" and "-->": buffered with pappend()
// when cfg->extfix is set, written directly to the output if not.
328 static void pcomment(void* ctxt, const xmlChar *uchars) {
329 const char* chars = (const char*) uchars;
330 saxctxt* ctx = (saxctxt*) ctxt ;
331 if ( ctx->cfg->strip_comments )
334 if ( ctx->cfg->extfix ) {
335 pappend(ctx, "<!--", 4) ;
336 pappend(ctx, chars, strlen(chars) ) ;
337 pappend(ctx, "-->", 3) ;
339 ap_fputs(ctx->f->next, ctx->bb, "<!--") ;
340 AP_fwrite(ctx, chars, strlen(chars), 1) ;
341 ap_fputs(ctx->f->next, ctx->bb, "-->") ;
// SAX end-element callback: flushes any buffered text and writes the
// closing tag.
//
// ctxt:  the saxctxt for this request (passed as void*).
// uname: element name; also looked up with htmlTagLookup().
//
// Buffered content (ctx->offset > 0) is handled first and the buffer is then
// reused from offset 0. A closing tag is written unless libxml2 describes
// the element as empty.
344 static void pendElement(void* ctxt, const xmlChar* uname) {
345 saxctxt* ctx = (saxctxt*) ctxt ;
346 const char* name = (const char*) uname;
347 const htmlElemDesc* desc = htmlTagLookup(uname);
// strict doctypes: unknown or deprecated elements get special treatment
// (the action itself is on a line not visible here)
349 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
351 if (!desc || desc->depr)
// NOTE(review): this condition repeats the one above, so the branch can
// never run. Given the "enforce html legacy" comment it presumably should
// test fpi_html_legacy / fpi_xhtml_legacy - confirm and fix.
354 } else if ((ctx->cfg->doctype == fpi_html)
355 || (ctx->cfg->doctype == fpi_xhtml)) {
356 /* enforce html legacy */
360 /* TODO - implement HTML "allowed here" using the stack */
361 /* nah. Keeping the stack is too much overhead */
363 if ( ctx->offset > 0 ) {
365 ctx->offset = 0 ; /* having dumped it, we can re-use the memory */
// elements libxml2 marks as empty get no closing tag
367 if ( !desc || ! desc->empty ) {
368 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ;
/* SAX start-element callback: writes the opening tag for an element,
 * rewriting attribute values with the rules in ctx->map.
 *
 * ctxt:   the saxctxt for this request (passed as void*).
 * uname:  element name.
 * uattrs: attribute list laid out as name, value, name, value, ...; the
 *         loop below stops at the first NULL name.
 *
 * Visible stages:
 *  - doctype enforcement: unknown ("bogus") and, at enforce == 2,
 *    deprecated elements are logged as dropped;
 *  - "<" and the element name are written;
 *  - with enforcement on, each attribute is checked with htmlAttrAllowed();
 *  - each value is copied into ctx->buf (including its NUL) and classified
 *    by looking the attribute name up in the ProxyHTMLLinks entry for this
 *    element and, with extfix, in the ProxyHTMLEvents list;
 *  - M_HTML rules are matched against the start of the value (fixed
 *    strings, case-insensitively) or as a single regex match; M_EVENTS
 *    rules are applied repeatedly within the value;
 *  - the value is passed through normalise() when cfg->flags is non-zero
 *    and written, escaped, through pcharacters();
 *  - the tag is closed with cfg->etag for empty elements, else with ">";
 *  - a count of missing required attributes is logged.
 */
371 static void pstartElement(void* ctxt, const xmlChar* uname,
372 const xmlChar** uattrs ) {
381 size_t s_to, s_from, match ;
383 saxctxt* ctx = (saxctxt*) ctxt ;
385 ap_regmatch_t pmatch[10] ;
387 int verbose = ctx->cfg->verbose ;
389 apr_array_header_t *linkattrs;
391 const char* name = (const char*) uname;
392 const char** attrs = (const char**) uattrs;
393 const htmlElemDesc* desc = htmlTagLookup(uname);
394 urlmap* themap = ctx->map;
399 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
402 if (!desc || desc->depr)
/* NOTE(review): this condition repeats the one above, so the "legacy"
 * branch can never run; presumably it should test fpi_html_legacy /
 * fpi_xhtml_legacy - confirm and fix. */
405 } else if ((ctx->cfg->doctype == fpi_html)
406 || (ctx->cfg->doctype == fpi_xhtml)) {
408 /* enforce html legacy */
413 if (!desc && enforce) {
414 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
415 "Bogus HTML element %s dropped", name) ;
418 if (desc && desc->depr && (enforce == 2) ) {
419 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
420 "Deprecated HTML element %s dropped", name) ;
424 descp = apr_array_push(ctx->stack);
426 /* TODO - implement HTML "allowed here" */
429 ap_fputc(ctx->f->next, ctx->bb, '<') ;
430 ap_fputs(ctx->f->next, ctx->bb, name) ;
/* walk the element's required-attribute list (loop body not visible;
 * presumably it counts them into required_attrs) */
433 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
434 for (a = desc->attrs_req; *a; a++)
/* attributes declared as links for this element, or NULL */
438 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING) ;
439 for ( a = attrs ; *a ; a += 2 ) {
440 if (desc && enforce > 0) {
/* third argument is 0 when enforce == 2 and 1 when enforce == 1 */
441 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
443 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
444 "Bogus HTML attribute %s of %s dropped", *a, name);
446 case HTML_DEPRECATED:
447 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
448 "Deprecated HTML attribute %s of %s dropped", *a, name);
451 required_attrs--; /* cross off the number still needed */
452 /* fallthrough - required implies valid */
/* buffer the value together with its terminating NUL */
459 pappend(ctx, a[1], strlen(a[1])+1) ;
460 is_uri = ATTR_IGNORE ;
/* NOTE(review): elements are read as tattr here, but set_links() creates
 * this array with element size sizeof(tattr*); that only works if the two
 * sizes coincide - confirm against the tattr definition. */
462 tattr* attrs = (tattr*) linkattrs->elts;
463 for (i=0; i < linkattrs->nelts; ++i) {
464 if ( !strcmp(*a, attrs[i].val)) {
470 if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix
471 && (ctx->cfg->events != NULL) ) {
472 for (i=0; i < ctx->cfg->events->nelts; ++i) {
473 tattr* attrs = (tattr*) ctx->cfg->events->elts;
474 if ( !strcmp(*a, attrs[i].val)) {
475 is_uri = ATTR_EVENT ;
/* M_HTML rules (presumably the ATTR_URI case - the selecting statement is
 * not visible) */
483 for ( m = themap ; m ; m = m->next ) {
484 if ( ! ( m->flags & M_HTML ) )
486 if ( m->flags & M_REGEX ) {
/* one regex match against the whole value */
488 if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) {
490 offs = match = pmatch[0].rm_so ;
491 s_from = pmatch[0].rm_eo - match ;
492 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf,
495 const char* f = apr_pstrndup(ctx->f->r->pool,
496 ctx->buf + offs , s_from ) ;
497 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
498 "H/RX: match at %s, substituting %s", f, subs) ;
500 s_to = strlen(subs) ;
501 len = strlen(ctx->buf) ;
502 if ( s_to > s_from) {
503 preserve(ctx, s_to - s_from) ;
504 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
505 len + 1 - s_from - offs) ;
506 memcpy(ctx->buf+offs, subs, s_to) ;
508 memcpy(ctx->buf + offs, subs, s_to) ;
509 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
510 len + 1 - s_from - offs) ;
/* fixed-string rule: case-insensitive match at the start of the value */
514 s_from = strlen(m->from.c) ;
515 if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) {
517 s_to = strlen(m->to) ;
518 len = strlen(ctx->buf) ;
519 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
520 "H: matched %s, substituting %s", m->from.c, m->to) ) ;
521 if ( s_to > s_from ) {
522 preserve(ctx, s_to - s_from) ;
523 memmove(ctx->buf+s_to, ctx->buf+s_from,
525 memcpy(ctx->buf, m->to, s_to) ;
526 } else { /* it fits in the existing space */
527 memcpy(ctx->buf, m->to, s_to) ;
528 memmove(ctx->buf+s_to, ctx->buf+s_from,
534 /* URIs only want one match unless overridden in the config */
535 if ( (num_match > 0) && !( m->flags & M_NOTLAST ) )
/* M_EVENTS rules (presumably the ATTR_EVENT case): every rule is tried, and
 * a rule may match several times within the value */
540 for ( m = themap ; m ; m = m->next ) {
541 num_match = 0 ; /* reset here since we're working per-rule */
542 if ( ! ( m->flags & M_EVENTS ) )
544 if ( m->flags & M_REGEX ) {
547 while ( ! ap_regexec(m->from.r, ctx->buf+offs,
548 nmatch, pmatch, 0) ) {
549 match = pmatch[0].rm_so ;
550 s_from = pmatch[0].rm_eo - match ;
551 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
554 const char* f = apr_pstrndup(ctx->f->r->pool,
555 ctx->buf + offs , s_from ) ;
556 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
557 "E/RX: match at %s, substituting %s", f, subs) ;
559 s_to = strlen(subs) ;
561 len = strlen(ctx->buf) ;
562 if ( s_to > s_from) {
563 preserve(ctx, s_to - s_from) ;
564 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
565 len + 1 - s_from - offs) ;
566 memcpy(ctx->buf+offs, subs, s_to) ;
568 memcpy(ctx->buf + offs, subs, s_to) ;
569 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
570 len + 1 - s_from - offs) ;
576 found = strstr(ctx->buf, m->from.c) ;
577 if ( (m->flags & M_ATSTART) && ( found != ctx->buf) )
580 s_from = strlen(m->from.c) ;
581 s_to = strlen(m->to) ;
582 match = found - ctx->buf ;
/* M_ATEND with text still following the match: look for a later match */
583 if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) {
584 found = strstr(ctx->buf+match+s_from, m->from.c) ;
587 found = strstr(ctx->buf+match+s_to, m->from.c) ;
589 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r,
590 "E: matched %s, substituting %s", m->from.c, m->to) ) ;
591 len = strlen(ctx->buf) ;
592 if ( s_to > s_from ) {
593 preserve(ctx, s_to - s_from) ;
594 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
595 len + 1 - s_from - match) ;
596 memcpy(ctx->buf+match, m->to, s_to) ;
598 memcpy(ctx->buf+match, m->to, s_to) ;
599 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
600 len + 1 - s_from - match) ;
/* unlike URI rules, event rules stop early only on an explicit M_LAST */
605 if ( num_match && ( m->flags & M_LAST ) )
/* attribute written as a bare name, with no value */
614 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ;
617 if ( ctx->cfg->flags != 0 )
618 normalise(ctx->cfg->flags, ctx->buf) ;
620 /* write the attribute, using pcharacters to html-escape
621 anything that needs it in the value.
623 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ;
624 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf)) ;
625 ap_fputc(ctx->f->next, ctx->bb, '"') ;
630 if ( desc && desc->empty )
631 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ;
633 ap_fputc(ctx->f->next, ctx->bb, '>') ;
635 if ((enforce > 0) && (required_attrs > 0)) {
636 /* if there are more required attributes than we found then complain */
637 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r,
638 "HTML element %s is missing %d required attributes",
639 name, required_attrs);
/* Scan buf for <meta http-equiv=...> elements and turn them into real
 * response headers.
 *
 * r:   current request; strings are allocated from r->pool and headers are
 *      set in r->headers_out.
 * buf: start of the document text, searched with the seek_meta regex.
 *      (The parameter list continues on a line not visible here;
 *      proxy_html_filter() shows one call form with an extra verbose
 *      argument and one without.)
 *
 * For each match the header name following "http-equiv" is extracted. For
 * names that do not start with "Content-", the value of the content=
 * attribute is extracted and the pair is added to r->headers_out. For
 * "Content-Type", a meta record holding the match's start and end offsets is
 * allocated instead; proxy_html_filter() uses those offsets to skip the
 * element when feeding the parser.
 *
 * NOTE(review): buf is handled as a C string throughout, but the caller
 * passes data obtained from apr_bucket_read() - confirm it is
 * NUL-terminated.
 */
643 static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/
654 ap_regmatch_t pmatch[2] ;
657 while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) {
/* p starts at the end of the "http-equiv" capture */
660 p = buf+offs+pmatch[1].rm_eo ;
/* NOTE(review): this skip has no check for the terminating NUL, so it can
 * run past the end of buf when no letter follows; isalpha() is also given
 * a plain char. */
661 while ( !isalpha(*++p) ) ;
662 for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ;
663 header = apr_pstrndup(r->pool, p, q-p) ;
/* non-zero result: the name does NOT begin with "Content-" */
664 if ( strncasecmp(header, "Content-", 8) ) {
665 /* find content=... string */
666 p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so,
667 pmatch[0].rm_eo - pmatch[0].rm_so);
668 /* if it doesn't contain "content", ignore, don't crash! */
672 while ( *p && isspace(*p) )
676 while ( *p && isspace(*++p) ) ;
677 if ( ( *p == '\'' ) || ( *p == '"' ) ) {
/* NOTE(review): no check for NUL here; an unterminated quote makes this
 * scan run off the end of buf. */
679 for ( q = p ; *q != delim ; ++q ) ;
/* unquoted value: ends at whitespace or '>' */
681 for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ;
683 content = apr_pstrndup(r->pool, p, q-p) ;
687 } else if ( !strncasecmp(header, "Content-Type", 12) ) {
688 ret = apr_palloc(r->pool, sizeof(meta) ) ;
689 ret->start = pmatch[0].rm_so ;
690 ret->end = pmatch[0].rm_eo ;
692 if ( header && content ) {
693 VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
694 "Adding header [%s: %s] from HTML META", header, content) ) ;
695 apr_table_setn(r->headers_out, header, content) ;
/* continue searching after this element */
697 offs += pmatch[0].rm_eo ;
/* Expand ${NAME} and ${NAME|default} references in str using the request's
 * environment table.
 *
 * r:   current request; values come from r->subprocess_env and all new
 *      strings are allocated from r->pool.
 * str: text to expand.
 *
 * Each expansion rebuilds str as before + replacement + after and logs the
 * substitution at debug level. The loop structure and the returned value
 * are on lines not visible in this excerpt.
 *
 * NOTE(review): the '|' search begins at the "${" and is not visibly limited
 * to the text before the closing '}'; a '|' later in the string could be
 * taken as the delimiter - confirm a bound check exists on the lines not
 * shown.
 */
702 static const char* interpolate_vars(request_rec* r, const char* str) {
708 const char* replacement;
712 if (start = ap_strstr_c(start, "${"), start == NULL)
715 if (end = ap_strchr_c(start+2, '}'), end == NULL)
718 delim = ap_strchr_c(start, '|');
719 before = apr_pstrndup(r->pool, str, start-str);
/* variable name: up to the '|' when there is one ... */
722 var = apr_pstrndup(r->pool, start+2, delim-start-2) ;
/* ... otherwise up to the closing brace */
724 var = apr_pstrndup(r->pool, start+2, end-start-2) ;
726 replacement = apr_table_get(r->subprocess_env, var) ;
/* text between '|' and '}' used as the replacement */
729 replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
733 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
734 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
735 "Interpolating %s => %s", var, replacement) ;
/* Build the per-request rule list from the configured rules in
 * ctx->cfg->map.
 *
 * For every configured rule: evaluate its optional condition against
 * r->subprocess_env and skip the rule when unsatisfied; make a shallow copy
 * from the request pool; interpolate the from-pattern (recompiling it when
 * it is a regex) and/or the to-pattern when the matching M_INTERPOLATE_
 * flag is set. How the copy is linked into the result list is on lines not
 * visible here.
 */
739 static void fixup_rules(saxctxt* ctx) {
744 request_rec* r = ctx->f->r;
747 for (p = ctx->cfg->map; p; p = p->next) {
749 if (p->cond != NULL) {
750 thisval = apr_table_get(r->subprocess_env, p->cond->env);
752 /* required to be "anything" */
754 has_cond = 1; /* satisfied */
756 has_cond = 0; /* unsatisfied */
/* condition names a value: compare it case-insensitively */
758 if (thisval && !strcasecmp(p->cond->val, thisval)) {
759 has_cond = 1; /* satisfied */
761 has_cond = 0; /* unsatisfied */
/* rel is 1 for a plain condition and -1 for a negated ("!") one */
764 if (((has_cond == 0) && (p->cond->rel ==1 ))
765 || ((has_cond == 1) && (p->cond->rel == -1))) {
766 continue; /* condition is unsatisfied */
/* shallow copy: pointers inside the rule are shared with the original */
770 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
772 if (newp->flags & M_INTERPOLATE_FROM) {
773 newp->from.c = interpolate_vars(r, newp->from.c);
774 if (!newp->from.c || !*newp->from.c)
775 continue; /* don't use empty from-pattern */
776 if (newp->flags & M_REGEX) {
/* NOTE(review): no check of the ap_pregcomp() result is visible here */
777 newp->from.r = ap_pregcomp(r->pool, newp->from.c, newp->regflags) ;
780 if (newp->flags & M_INTERPOLATE_TO) {
781 newp->to = interpolate_vars(r, newp->to);
783 /* evaluate p->cond; continue if unsatisfied */
784 /* create new urlmap with memcpy and append to map */
785 /* interpolate from if flagged to do so */
786 /* interpolate to if flagged to do so */
/* Decide whether the filter should run for this request and set up its
 * context.
 *
 * f: the filter instance; f->ctx receives the new saxctxt.
 *
 * Reasons for declining, each with its own message: the request is not a
 * proxy request, it has no content type, the content type is neither
 * text/html nor application/xhtml+xml, or no links are configured. In the
 * declining path the message is logged when cfg->verbose is set and the
 * filter removes itself. The PROXY_HTML_FORCE environment variable is read;
 * how it affects these checks is on lines not visible here.
 *
 * Otherwise a zeroed context is allocated from the request pool, an output
 * brigade is created, the Content-Length response header is removed and the
 * configured rule list is attached. Return values are not visible in this
 * excerpt.
 */
798 static saxctxt* check_filter_init (ap_filter_t* f) {
802 = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
803 const char* force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
805 const char* errmsg = NULL ;
807 if ( ! f->r->proxyreq ) {
808 errmsg = "Non-proxy request; not inserting proxy-html filter" ;
809 } else if ( ! f->r->content_type ) {
810 errmsg = "No content-type; bailing out of proxy-html filter" ;
/* prefix comparison, so parameters after the media type are accepted */
811 } else if ( strncasecmp(f->r->content_type, "text/html", 9) &&
812 strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) {
813 errmsg = "Non-HTML content; not inserting proxy-html filter" ;
817 errmsg = "No links configured: nothing for proxy-html filter to do";
822 if ( cfg->verbose ) {
823 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, "%s", errmsg) ;
826 ap_remove_output_filter(f) ;
830 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ;
832 fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ;
/* the rewritten body will generally differ in length */
834 apr_table_unset(f->r->headers_out, "Content-Length") ;
839 fctx->map = cfg->map;
840 /* defer dealing with charset_out until after sniffing charset_in
841 * so we can support setting one to t'other.
// Output filter entry point: feeds the response body through the libxml2
// HTML push parser; the SAX callbacks write the rewritten markup to
// ctxt->bb.
//
// f:  this filter instance.
// bb: incoming brigade; it is cleaned up at the end of the visible code.
//
// Metadata buckets: EOS finishes the parse, appends an EOS bucket to the
// output brigade and passes it on; FLUSH is forwarded only once a parser
// exists. Data buckets: on the first one the input charset is obtained
// through xml2enc_charset (when available), the output content type is set,
// the doctype is written and the parser is created; an optional metafix()
// pass then runs and the data is handed to the parser.
846 static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) {
849 xmlCharEncoding enc ;
850 const char* buf = 0 ;
851 apr_size_t bytes = 0 ;
852 #ifndef USE_OLD_LIBXML2
853 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
854 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ;
857 saxctxt* ctxt = check_filter_init(f) ;
// pass-through path (its guarding condition is on a line not visible here)
862 return ap_pass_brigade(f->next, bb) ;
864 verbose = ctxt->cfg->verbose;
867 for ( b = APR_BRIGADE_FIRST(bb) ;
868 b != APR_BRIGADE_SENTINEL(bb) ;
869 b = APR_BUCKET_NEXT(b) ) {
870 if ( APR_BUCKET_IS_METADATA(b) ) {
871 if ( APR_BUCKET_IS_EOS(b) ) {
872 if ( ctxt->parser != NULL ) {
// zero bytes with the terminate flag set: finish the parse
873 consume_buffer(ctxt, buf, 0, 1);
875 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
876 apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ;
// NOTE(review): the status returned by ap_pass_brigade is discarded here
877 ap_pass_brigade(ctxt->f->next, ctxt->bb) ;
878 } else if ( APR_BUCKET_IS_FLUSH(b) ) {
879 /* pass on flush, except at start where it would cause
880 * headers to be sent before doc sniffing
882 if ( ctxt->parser != NULL ) {
883 ap_fflush(ctxt->f->next, ctxt->bb) ;
886 } else if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
// first data bucket: no parser yet
888 if ( ctxt->parser == NULL ) {
890 if (!xml2enc_charset ||
891 (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) {
892 if (!xml2enc_charset)
893 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
894 "No i18n support found. Install mod_xml2enc if required") ;
895 enc = XML_CHAR_ENCODING_NONE;
896 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
898 /* if we wanted a non-default charset_out, insert the
899 * xml2enc filter now that we've sniffed it
901 if (ctxt->cfg->charset_out && xml2enc_filter) {
// a charset_out starting with '*' keeps the detected input charset
902 if (*ctxt->cfg->charset_out != '*')
903 cenc = ctxt->cfg->charset_out;
904 xml2enc_filter(f->r, cenc, ENCIO_OUTPUT);
905 ap_set_content_type(f->r,
906 apr_pstrcat(f->r->pool, "text/html;charset=", cenc, NULL)) ;
907 } else /* Normal case, everything worked, utf-8 output */
908 ap_set_content_type(f->r, "text/html;charset=utf-8") ;
911 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype) ;
// NOTE(review): the parser is primed with the first 4 bytes of buf. No
// check that bytes >= 4 is visible, nor whether buf/bytes are advanced
// past those 4 bytes before the consume_buffer calls below - confirm.
912 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf, 4, 0, enc) ;
// parser creation failed: pass the data through and drop out
915 if (ctxt->parser == NULL) {
916 apr_status_t rv = ap_pass_brigade(f->next, bb) ;
917 ap_remove_output_filter(f) ;
// free the parser context when the request pool is destroyed
920 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
921 (int(*)(void*))htmlFreeParserCtxt, apr_pool_cleanup_null) ;
922 #ifndef USE_OLD_LIBXML2
// a non-zero result is logged as the set of unsupported options
923 if ( xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts ), xmlopts )
924 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r,
925 "Unsupported parser opts %x", xmlopts) ;
927 if ( ctxt->cfg->metafix )
// two call forms, presumably selected by a preprocessor conditional that
// is not visible here
929 m = metafix(f->r, buf, ctxt->cfg->verbose) ;
931 m = metafix(f->r, buf) ;
// skip the span [m->start, m->end) reported by metafix
934 consume_buffer(ctxt, buf, m->start, 0) ;
935 consume_buffer(ctxt, buf+m->end, bytes-m->end, 0) ;
937 consume_buffer(ctxt, buf, bytes, 0) ;
940 consume_buffer(ctxt, buf, bytes, 0) ;
943 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ;
946 /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */
947 apr_brigade_cleanup(bb) ;
/* Per-directory config constructor: returns a zero-filled proxy_html_conf
 * from pool with the default doctype and empty-tag ending installed.
 * x (the directory name) is not used in the visible lines. */
951 static void* proxy_html_config(apr_pool_t* pool, char* x) {
952 proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ;
953 ret->doctype = DEFAULT_DOCTYPE ;
954 ret->etag = DEFAULT_ETAG ;
956 /* ret->interp = 1; */
957 /* don't initialise links and events until they get set/used */
/* Per-directory config merger.
 *
 * BASE: the less specific configuration.
 * ADD:  the more specific configuration.
 *
 * links, events and charset_out come from ADD when set there, else from
 * BASE. doctype and etag come from BASE only while ADD still holds the
 * default object (pointer comparison). bufsz always comes from ADD. The
 * boolean options and flags are taken from ADD alone when ADD carries
 * NORM_RESET, and OR-ed with BASE otherwise.
 *
 * NOTE(review): conf comes from apr_palloc(), which does not zero memory,
 * so every member must be assigned explicitly; the first loop below reads
 * conf->map, so it must be set before that loop - confirm on the lines not
 * visible here.
 */
960 static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) {
961 proxy_html_conf* base = (proxy_html_conf*) BASE ;
962 proxy_html_conf* add = (proxy_html_conf*) ADD ;
963 proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ;
965 /* don't merge declarations - just use the most specific */
966 conf->links = (add->links == NULL) ? base->links : add->links;
967 conf->events = (add->events == NULL) ? base->events : add->events;
969 conf->charset_out = (add->charset_out == NULL)
970 ? base->charset_out : add->charset_out ;
/* both levels define rules: copy both lists into a new one.
 * NOTE(review): each copy is pushed on the front of the list, so the
 * merged list ends up as ADD's rules in reverse order followed by BASE's
 * rules in reverse order - confirm this ordering is intended. */
972 if ( add->map && base->map ) {
975 for ( a = base->map ; a ; a = a->next ) {
976 urlmap* save = conf->map ;
977 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
978 conf->map->next = save ;
980 for ( a = add->map ; a ; a = a->next ) {
981 urlmap* save = conf->map ;
982 conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ;
983 conf->map->next = save ;
/* at most one level defines rules: share that list as it is */
986 conf->map = add->map ? add->map : base->map ;
988 conf->doctype = ( add->doctype == DEFAULT_DOCTYPE )
989 ? base->doctype : add->doctype ;
990 conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ;
991 conf->bufsz = add->bufsz ;
992 if ( add->flags & NORM_RESET ) {
/* the bit is known to be set here, so the XOR clears it */
993 conf->flags = add->flags ^ NORM_RESET ;
994 conf->metafix = add->metafix ;
995 conf->extfix = add->extfix ;
996 conf->interp = add->interp ;
997 conf->strip_comments = add->strip_comments ;
998 conf->enabled = add->enabled;
1000 conf->verbose = add->verbose ;
/* no reset requested: an option is on if either level turned it on */
1003 conf->flags = base->flags | add->flags ;
1004 conf->metafix = base->metafix | add->metafix ;
1005 conf->extfix = base->extfix | add->extfix ;
1006 conf->interp = base->interp | add->interp ;
1007 conf->strip_comments = base->strip_comments | add->strip_comments ;
1008 conf->enabled = add->enabled | base->enabled;
1010 conf->verbose = base->verbose | add->verbose ;
/* REGFLAG yields n when the flag string s contains character c.
 * XREGFLAG yields n when s is NULL or does NOT contain c, i.e. the letter
 * switches the bit off.
 * NOTE(review): s is not parenthesised in "s&&" and "!s"; that is harmless
 * for the plain identifiers used below but fragile for other arguments. */
1015 #define REGFLAG(n,s,c) ( (s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0 )
1016 #define XREGFLAG(n,s,c) ( (!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0 )
/* Fill in newmap from the textual arguments of one ProxyHTMLURLMap line.
 *
 * pool:   pool for the compiled regex and copied strings.
 * newmap: rule to initialise.
 * from:   from-pattern; stored as a string when the rule is not a regex or
 *         is interpolated per request, otherwise compiled with ap_pregcomp.
 * to:     to-pattern.
 * flags:  flag letters, may be NULL. 'h', 'e' and 'c' switch OFF M_HTML,
 *         M_EVENTS and M_CDATA; '^', '$', 'R', 'L', 'l', 'V' and 'v' switch
 *         ON the bits listed below; 'x', 'i', 'n' and 's' select the regex
 *         compile flags.
 * cond:   optional condition. A leading '!' negates it (rel = -1, else 1);
 *         the text is copied as the variable name and '=' is searched for
 *         to locate a value.
 */
1017 static void comp_urlmap(apr_pool_t* pool, urlmap* newmap,
1018 const char* from, const char* to, const char* flags, const char* cond) {
1021 = XREGFLAG(M_HTML,flags,'h')
1022 | XREGFLAG(M_EVENTS,flags,'e')
1023 | XREGFLAG(M_CDATA,flags,'c')
1024 | REGFLAG(M_ATSTART,flags,'^')
1025 | REGFLAG(M_ATEND,flags,'$')
1026 | REGFLAG(M_REGEX,flags,'R')
1027 | REGFLAG(M_LAST,flags,'L')
1028 | REGFLAG(M_NOTLAST,flags,'l')
1029 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1030 | REGFLAG(M_INTERPOLATE_FROM,flags,'v')
/* keep the pattern as text: a plain string rule, or a regex that
 * fixup_rules() will compile after interpolation */
1032 if ( ( newmap->flags & M_INTERPOLATE_FROM)
1033 || ! (newmap->flags & M_REGEX) ) {
1034 newmap->from.c = from ;
1038 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1039 | REGFLAG(AP_REG_ICASE,flags,'i')
1040 | REGFLAG(AP_REG_NOSUB,flags,'n')
1041 | REGFLAG(AP_REG_NEWLINE,flags,'s')
/* NOTE(review): no check of the ap_pregcomp() result is visible here */
1043 newmap->from.r = ap_pregcomp(pool, from, newmap->regflags) ;
1048 newmap->cond = apr_pcalloc(pool, sizeof(rewritecond));
1049 if (cond[0] == '!') {
1050 newmap->cond->rel = -1;
1051 newmap->cond->env = cond_copy = apr_pstrdup(pool, cond+1);
1053 newmap->cond->rel = 1;
1054 newmap->cond->env = cond_copy = apr_pstrdup(pool, cond);
/* the search for '=' starts at the second character of the name */
1056 eq = ap_strchr(++cond_copy, '=');
1059 newmap->cond->val = eq+1;
1062 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (raw arguments).
 *
 * cmd:  directive context; its pool is used for all allocations.
 * CFG:  the proxy_html_conf being configured.
 * args: raw argument text, split into words with ap_getword_conf().
 *
 * Reads from-pattern, to-pattern, optional flags and, only when flags is
 * non-empty, an optional condition. A new rule is then allocated, linked at
 * the tail of cfg->map and initialised by comp_urlmap(). Return values are
 * on lines not visible here (presumably the usage string on error and NULL
 * on success - confirm).
 *
 * NOTE(review): confirm that ap_getword_conf() can return NULL; if it
 * returns an empty string when no word is left, the !from / !to checks
 * below never trigger.
 */
1065 static const char* set_urlmap(cmd_parms* cmd, void* CFG, const char* args) {
1066 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
1068 apr_pool_t* pool = cmd->pool;
1071 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1075 const char* cond = NULL;
1077 if (from = ap_getword_conf(cmd->pool, &args), !from)
1079 if (to = ap_getword_conf(cmd->pool, &args), !to)
1081 flags = ap_getword_conf(cmd->pool, &args);
1082 if (flags && *flags)
1083 cond = ap_getword_conf(cmd->pool, &args);
1087 /* the args look OK, so let's use them */
1088 newmap = apr_palloc(pool, sizeof(urlmap) ) ;
1089 newmap->next = NULL;
/* walk to the last existing rule and link the new one after it */
1091 for ( map = cfg->map ; map->next ; map = map->next ) ;
1092 map->next = newmap ;
1096 comp_urlmap(cmd->pool, newmap, from, to, flags, cond);
/* Handler for the ProxyHTMLDoctype directive.
 *
 * t: "xhtml" or "html" (case-insensitive) selects a built-in doctype; any
 *    other text is copied and used verbatim as the doctype.
 * l: optional second argument (its declaration is on a line not visible
 *    here). With a built-in doctype, "legacy" selects the Transitional
 *    variant. With a custom doctype, a value starting with 'x' or 'X'
 *    selects the XHTML empty-tag ending.
 *
 * Sets cfg->doctype and cfg->etag together.
 */
1100 static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t,
1102 proxy_html_conf* cfg = (proxy_html_conf*)CFG ;
1103 if ( !strcasecmp(t, "xhtml") ) {
1104 cfg->etag = xhtml_etag ;
1105 if ( l && !strcasecmp(l, "legacy") )
1106 cfg->doctype = fpi_xhtml_legacy ;
1108 cfg->doctype = fpi_xhtml ;
1109 } else if ( !strcasecmp(t, "html") ) {
1110 cfg->etag = html_etag ;
1111 if ( l && !strcasecmp(l, "legacy") )
1112 cfg->doctype = fpi_html_legacy ;
1114 cfg->doctype = fpi_html ;
/* custom doctype text */
1116 cfg->doctype = apr_pstrdup(cmd->pool, t) ;
1117 if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) )
1118 cfg->etag = xhtml_etag ;
1120 cfg->etag = html_etag ;
/* Handler for the ProxyHTMLFixups directive; called once per argument.
 *
 * arg: "lowercase", "dospath" or "reset" (exact, case-sensitive match);
 *      each sets the corresponding NORM_ bit in cfg->flags. What happens
 *      for any other word is not visible in this excerpt.
 */
1124 static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg) {
1125 proxy_html_conf* cfg = CFG;
1126 if ( arg && *arg ) {
1127 if ( !strcmp(arg, "lowercase") )
1128 cfg->flags |= NORM_LC ;
1129 else if ( !strcmp(arg, "dospath") )
1130 cfg->flags |= NORM_MSSLASH ;
1131 else if ( !strcmp(arg, "reset") )
1132 cfg->flags |= NORM_RESET ;
/* Handler for the ProxyHTMLEvents directive; called once per argument.
 * Creates cfg->events on first use (array of tattr, initial capacity 20)
 * and pushes one new element; storing arg into that element happens on a
 * line not visible here. */
1136 static const char* set_events(cmd_parms* cmd, void* CFG, const char* arg) {
1138 proxy_html_conf* cfg = CFG;
1139 if (cfg->events == NULL)
1140 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1141 attr = apr_array_push(cfg->events) ;
/* Handler for the ProxyHTMLLinks directive; called once per
 * (element, attribute) pair.
 *
 * elt: element name, used as the key in the cfg->links hash.
 * att: attribute of that element to be treated as a link.
 *
 * Creates the hash on first use, creates the per-element attribute array on
 * first use and pushes one new entry; storing att into that entry happens
 * on a line not visible here.
 */
1145 static const char* set_links(cmd_parms* cmd, void* CFG,
1146 const char* elt, const char* att) {
1147 apr_array_header_t* attrs;
1149 proxy_html_conf* cfg = CFG;
1151 if (cfg->links == NULL)
1152 cfg->links = apr_hash_make(cmd->pool);
1154 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING) ;
/* NOTE(review): the element size here is sizeof(tattr*), while
 * pstartElement() reads the elements as tattr and set_events() uses
 * sizeof(tattr). This is only correct if the two sizes are equal - confirm
 * against the tattr definition; sizeof(tattr) looks like the intent. */
1156 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*)) ;
1157 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs) ;
1159 attr = apr_array_push(attrs) ;
/* Configuration directives provided by this module. All entries are
 * allowed in both server and directory context (RSRC_CONF|ACCESS_CONF).
 * Flag, integer and string options are stored directly into the
 * proxy_html_conf member named in each entry; the others use the set_
 * handlers defined above. */
1163 static const command_rec proxy_html_cmds[] = {
1164 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1165 RSRC_CONF|ACCESS_CONF, "Strings to be treated as scripting events"),
1166 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1167 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
1168 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1169 RSRC_CONF|ACCESS_CONF, "Map URL From To" ) ,
1170 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1171 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) ,
/* NOTE(review): set_flags() also accepts "reset", which this help text
 * does not mention. */
1172 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1173 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) ,
1174 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1175 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1176 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) ,
1177 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1178 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1179 RSRC_CONF|ACCESS_CONF,
1180 "Support interpolation and conditions in URLMaps" ) ,
1181 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1182 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1183 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) ,
1184 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1185 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1186 RSRC_CONF|ACCESS_CONF, "Strip out comments" ) ,
1188 AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot,
1189 (void*)APR_OFFSETOF(proxy_html_conf, verbose),
1190 RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) ,
/* growth step used by preserve() for the rewrite buffer */
1192 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1193 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1194 RSRC_CONF|ACCESS_CONF, "Buffer size" ) ,
1195 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1196 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1197 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset" ) ,
1198 AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot,
1199 (void*)APR_OFFSETOF(proxy_html_conf, enabled),
1200 RSRC_CONF|ACCESS_CONF, "Enable proxy-html and xml2enc filters" ) ,
/* post_config hook: one-time initialisation of the module's globals.
 *
 * Adds the version string to the server banner, compiles the seek_meta
 * regex and the "content" search pattern from pool p, clears the SAX
 * handler table and installs this module's five callbacks, then fetches
 * the optional mod_xml2enc functions. When xml2enc_charset is unavailable
 * a notice explaining the consequence is logged. The remaining parameter
 * and the return value are on lines not visible here.
 */
1203 static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2,
1205 ap_add_version_component(p, VERSION_STRING) ;
/* capture group 1 is the literal "http-equiv"; metafix() relies on its
 * end offset */
1206 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1207 AP_REG_EXTENDED|AP_REG_ICASE) ;
1208 seek_content = apr_strmatch_precompile(p, "content", 0);
1209 memset(&sax, 0, sizeof(htmlSAXHandler));
1210 sax.startElement = pstartElement ;
1211 sax.endElement = pendElement ;
1212 sax.characters = pcharacters ;
1213 sax.comment = pcomment ;
1214 sax.cdataBlock = pcdata ;
1215 xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
1216 xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
1217 if (!xml2enc_charset) {
1218 ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2,
1219 "I18n support in mod_proxy_html requires mod_xml2enc. "
1220 "Without it, non-ASCII characters in proxied pages are "
1221 "likely to display incorrectly.");
/* insert_filter hook: sets up input charset checking through
 * xml2enc_filter and adds the "proxy-html" output filter to the request.
 * NOTE(review): xml2enc_filter is NULL when mod_xml2enc is not loaded, and
 * cfg is fetched but not used in the visible lines; the guarding
 * conditions are presumably on the lines not shown - confirm. */
1225 static void proxy_html_insert(request_rec* r) {
1226 proxy_html_conf* cfg
1227 = ap_get_module_config(r->per_dir_config, &proxy_html_module);
1230 xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS);
1231 ap_add_output_filter("proxy-html", NULL, r, r->connection);
/* Hook registration: registers the "proxy-html" output filter (a resource
 * filter declared as changing the content and its length), the post_config
 * initialiser and the insert_filter hook. aszSucc is passed as the
 * successor list so that this insert_filter hook runs before mod_filter's. */
1234 static void proxy_html_hooks(apr_pool_t* p) {
1235 static const char* aszSucc[] = { "mod_filter.c", NULL };
1236 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1237 NULL, AP_FTYPE_RESOURCE,
1238 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH) ;
1239 ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ;
1240 ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE) ;
1242 module AP_MODULE_DECLARE_DATA proxy_html_module = {
1243 STANDARD20_MODULE_STUFF,