From 1c8f801cbb8ac75252face3a31f6dbfa02878ceb Mon Sep 17 00:00:00 2001 From: Emmanuel Lacour Date: Sat, 21 Nov 2009 01:25:31 +0100 Subject: [PATCH 1/1] Initialize packaging of mod_xml2enc --- debian/changelog | 6 + debian/compat | 1 + debian/conf/xml2enc.load | 2 + debian/control | 17 ++ debian/copyright | 28 +++ debian/dirs | 2 + debian/install | 2 + debian/postinst | 68 ++++++ debian/prerm | 51 ++++ debian/rules | 51 ++++ mod_xml2enc.c | 622 +++++++++++++++++++++++++++++++++++++++++++++++ mod_xml2enc.h | 39 +++ 12 files changed, 889 insertions(+) create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/conf/xml2enc.load create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/dirs create mode 100644 debian/install create mode 100644 debian/postinst create mode 100644 debian/prerm create mode 100755 debian/rules create mode 100644 mod_xml2enc.c create mode 100644 mod_xml2enc.h diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..ed4ecca --- /dev/null +++ b/debian/changelog @@ -0,0 +1,6 @@ +mod-xml2enc (1.0.3-1) unstable; urgency=low + + * Initial Release, closes: #FIXME + + -- Emmanuel Lacour Sat, 21 Nov 2009 01:06:37 +0100 + diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..b8626c4 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +4 diff --git a/debian/conf/xml2enc.load b/debian/conf/xml2enc.load new file mode 100644 index 0000000..9c88f46 --- /dev/null +++ b/debian/conf/xml2enc.load @@ -0,0 +1,2 @@ +LoadFile /usr/lib/libxml2.so.2 +LoadModule xml2enc_module /usr/lib/apache2/modules/mod_xml2enc.so diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..7768c35 --- /dev/null +++ b/debian/control @@ -0,0 +1,17 @@ +Source: mod-xml2enc +Section: web +Priority: optional +Maintainer: Emmanuel Lacour +Build-Depends: debhelper (>= 4.0.0), apache2-prefork-dev (>> 2.2), libxml2-dev (>> 2.5.10) +Standards-Version: 
3.8.0 + +Package: libapache2-mod-xml2enc +Architecture: any +Depends: ${shlibs:Depends}, apache2, apache2.2-common, libxml2 (>> 2.5.10) +Description: Apache2 transcoding module based on libxml2 + mod_xml2enc is a transcoding module that can be used to extend the + internationalisation support of libxml2-based filter modules by converting + encoding before and/or after the filter has run. Thus an unsupported input + charset can be converted to UTF-8, and output can also be converted to another + charset if required. + diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..8b7040b --- /dev/null +++ b/debian/copyright @@ -0,0 +1,28 @@ +This package was debianized by Emmanuel Lacour on +Sat, 21 Nov 2009 01:11:09 +0100 + +It was downloaded from http://apache.webthing.com/mod_xml2enc/ + +Upstream Author: Nick Kew + +Copyright (c) 2007-8, WebThing Ltd + +License: + +FIXME: should we distribute this with both apache2/GPL as upstream ? + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + +see /usr/share/common-licenses/GPL for all details. 
diff --git a/debian/dirs b/debian/dirs new file mode 100644 index 0000000..59d28be --- /dev/null +++ b/debian/dirs @@ -0,0 +1,2 @@ +/usr/lib/apache2/modules +/etc/apache2/mods-available diff --git a/debian/install b/debian/install new file mode 100644 index 0000000..56bd189 --- /dev/null +++ b/debian/install @@ -0,0 +1,2 @@ +debian/conf/xml2enc.load /etc/apache2/mods-available/ +.libs/mod_xml2enc.so /usr/lib/apache2/modules/ diff --git a/debian/postinst b/debian/postinst new file mode 100644 index 0000000..77877d1 --- /dev/null +++ b/debian/postinst @@ -0,0 +1,68 @@ +#! /bin/sh +# postinst script for libapache2-mod-xml2enc +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `configure' +# * `abort-upgrade' +# * `abort-remove' `in-favour' +# +# * `abort-deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package +# + +reload_apache() +{ + if apache2ctl configtest 2>/dev/null; then + invoke-rc.d apache2 force-reload || true + else + echo "Your apache2 configuration is broken, so we're not restarting it for you." + fi +} + + +case "$1" in + configure) + # Reload the module on upgrade if enabled + if [ -n "$2" ]; then + if [ -e /etc/apache2/mods-enabled/xml2enc.load ]; then + # We must reenable this module to enable the new configuration file + if dpkg --compare-versions "$2" lt "3.0.0-1" ; then + a2dismod xml2enc >/dev/null || true + a2enmod xml2enc >/dev/null || true + fi + reload_apache + fi + else + # Enable the module + if [ -e /etc/apache2/apache2.conf ]; then + a2enmod xml2enc >/dev/null || true + reload_apache + fi + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. 
+ +#DEBHELPER# + +exit 0 + + diff --git a/debian/prerm b/debian/prerm new file mode 100644 index 0000000..44f4a3d --- /dev/null +++ b/debian/prerm @@ -0,0 +1,51 @@ +#! /bin/sh +# prerm script for libapache2-mod-xml2enc +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `remove' +# * `upgrade' +# * `failed-upgrade' +# * `remove' `in-favour' +# * `deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + +reload_apache() +{ + if apache2ctl configtest 2>/dev/null; then + invoke-rc.d apache2 force-reload || true + else + echo "Your apache2 configuration is broken, so we're not restarting it for you." + fi +} + + +case "$1" in + remove) + if [ -e /etc/apache2/mods-enabled/xml2enc.load ]; then + a2dismod xml2enc >/dev/null || true + reload_apache + fi + ;; + upgrade|failed-upgrade|deconfigure) + ;; + *) + echo "prerm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 + + diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..5b6d2e9 --- /dev/null +++ b/debian/rules @@ -0,0 +1,51 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. + +# Uncomment this to turn on verbose mode. 
+#export DH_VERBOSE=1 + +build: build-stamp + +build-stamp: + dh_testdir + apxs2 -c -I/usr/include/libxml2/ mod_xml2enc.c + touch build-stamp + +clean: + dh_testdir + dh_testroot + rm -f build-stamp mod_xml2enc.la mod_xml2enc.lo mod_xml2enc.o mod_xml2enc.slo + rm -rf .libs + dh_clean + +install: build + dh_testdir + dh_testroot + dh_clean -k + dh_installdirs + dh_install + +binary-indep: + +binary-arch: build install + dh_testdir + dh_testroot + dh_installchangelogs + dh_link + dh_strip + dh_compress + dh_fixperms + dh_makeshlibs + dh_installdeb + dh_shlibdeps + dh_gencontrol + dh_md5sums + dh_builddeb + +binary: binary-arch +.PHONY: build clean binary-arch binary-indep binary install diff --git a/mod_xml2enc.c b/mod_xml2enc.c new file mode 100644 index 0000000..2370c4a --- /dev/null +++ b/mod_xml2enc.c @@ -0,0 +1,622 @@ +/******************************************************************** + Copyright (c) 2007-8, WebThing Ltd + Author: Nick Kew + + * This work is available to you under EITHER the Apache License Version 2.0 + * OR the GNU General Public License Version 2. It is your choice which + * of these licenses you accept, but if you wish to copy or use this + * work, you MUST accept one of these licenses and abide by its terms. + * + * + * + * OPTION 1: Apache License + * WebThing licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + * + * OPTION 2: GNU General Public License + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License Version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You can obtain a copy of the GNU General Public License Version 2 + * from http://www.gnu.org/licenses/old-licenses/gpl-2.0.html or + * http://apache.webthing.com/COPYING.txt + +**********************************************************************/ + +/* Version 1.0.3 - Bugfix against crash on no-content-type response + * reaching the filter function + */ + +#if defined(WIN32) +#define XML2ENC_DECLARE_EXPORT +#endif + +#include + +/* libxml2 */ +#include + +/* apache */ +#include +#include +#include +#include +#include + +#include +#include "mod_xml2enc.h" + +/* Apache 2.0 isn't really supported, but "should work" with these #defines. 
*/ +#ifndef AP_REG_ICASE +/* it's 2.0, so we #define the ap_ versions */ +#define ap_regex_t regex_t +#define ap_regmatch_t regmatch_t +#define AP_REG_EXTENDED REG_EXTENDED +#define AP_REG_ICASE REG_ICASE +#define AP_REG_NOSUB REG_NOSUB +#define AP_REG_NEWLINE REG_NEWLINE +#define APACHE20 +#define ap_register_output_filter_protocol(a,b,c,d,e) ap_register_output_filter(a,b,c,d) +#else +#define APACHE22 +#endif + +module AP_MODULE_DECLARE_DATA xml2enc_module; + +#define BUFLEN 8192 +#define BUF_MIN 4096 +#define APR_BRIGADE_DO(b,bb) for (b = APR_BRIGADE_FIRST(bb); \ + b != APR_BRIGADE_SENTINEL(bb); b = APR_BUCKET_NEXT(b)) + +#define ENC_INITIALISED 0x100 +#define ENC_SEEN_EOS 0x200 +#define ENC_SKIPTO ENCIO_SKIPTO + +#define HAVE_ENCODING(enc) \ + (((enc)!=XML_CHAR_ENCODING_NONE)&&((enc)!=XML_CHAR_ENCODING_ERROR)) + +typedef struct { + xmlCharEncoding xml2enc; + char* buf; + apr_size_t bytes; + apr_xlate_t* convset; + unsigned int flags; + apr_off_t bblen; + apr_bucket_brigade* bbnext; + apr_bucket_brigade* bbsave; + const char* encoding; +} xml2ctx; + +typedef struct { + const char* default_charset; + xmlCharEncoding default_encoding; + apr_array_header_t* skipto; +} xml2cfg; + +typedef struct { + const char* val; +} tattr; + +static ap_regex_t* seek_meta_ctype; +static ap_regex_t* seek_charset; + +static apr_status_t xml2enc_filter(request_rec* r, const char* enc, + unsigned int mode) { + /* set up a ready-initialised ctx to convert to enc, and insert filter */ + apr_xlate_t* convset; + apr_status_t rv; + unsigned int flags = (mode ^ ENCIO); + if ((mode & ENCIO) == ENCIO_OUTPUT) { + rv = apr_xlate_open(&convset, enc, "UTF-8", r->pool); + flags |= ENC_INITIALISED; + } else if ((mode & ENCIO) == ENCIO_INPUT) { + rv = apr_xlate_open(&convset, "UTF-8", enc, r->pool); + flags |= ENC_INITIALISED; + } else if ((mode & ENCIO) == ENCIO_INPUT_CHECKS) { + convset = NULL; + rv = APR_SUCCESS; /* we'll initialise later by sniffing */ + } else { + rv = APR_EGENERAL; + 
ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, "xml2enc: bad mode %x", mode); + } + if (rv == APR_SUCCESS) { + xml2ctx* ctx = apr_pcalloc(r->pool, sizeof(xml2ctx)); + ctx->flags = flags; + if (flags & ENC_INITIALISED) { + ctx->convset = convset; + ctx->bblen = BUFLEN; + ctx->buf = apr_palloc(r->pool, (apr_size_t)ctx->bblen); + } + ap_add_output_filter("xml2enc", ctx, r, r->connection); + } else { + ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, + "xml2enc: Charset %s not supported.", enc) ; + } + return rv; +} + +/* This needs to operate only when we're using htmlParser */ +/* Different modules may apply different rules here. Ho, hum. */ +static void fix_skipto(request_rec* r, xml2ctx* ctx) { + apr_status_t rv; + xml2cfg* cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module); + if ((cfg->skipto != NULL) && (ctx->flags | ENC_SKIPTO)) { + int found = 0; + char* p = ap_strchr(ctx->buf, '<'); + tattr* starts = (tattr*) cfg->skipto->elts; + while (!found && p && *p) { + int i; + for (i = 0; i < cfg->skipto->nelts; ++i) { + if (!strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) { + /* found a starting element. Strip all that comes before. 
*/ + apr_bucket* b; + apr_bucket* bstart; + rv = apr_brigade_partition(ctx->bbsave, (p-ctx->buf), &bstart); + while (b = APR_BRIGADE_FIRST(ctx->bbsave), b != bstart) { + APR_BUCKET_REMOVE(b); + apr_bucket_destroy(b); + } + ctx->bytes -= (p-ctx->buf); + ctx->buf = p ; + found = 1; + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, + "Skipped to first <%s> element", starts[i].val) ; + break; + } + } + p = ap_strchr(p+1, '<'); + } + if (p == NULL) { + ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, + "Failed to find start of recognised HTML!") ; + } + } +} +static void sniff_encoding(request_rec* r, xml2ctx* ctx) { + xml2cfg* cfg = NULL; /* initialise to shut compiler warnings up */ + char* p ; + apr_bucket* cutb; + apr_bucket* cute; + apr_bucket* b; + ap_regmatch_t match[2] ; + apr_status_t rv; + const char* ctype = r->content_type; + + if (ctype) { + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, "Content-Type is %s", ctype) ; + +/* If we've got it in the HTTP headers, there's nothing to do */ + if (ctype && (p = ap_strcasestr(ctype, "charset=") , p != NULL)) { + p += 8 ; + if (ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ), ctx->encoding) { + ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, + "Got charset %s from HTTP headers", ctx->encoding) ; + ctx->xml2enc = xmlParseCharEncoding(ctx->encoding); + } + } + } + +/* to sniff, first we look for BOM */ + if (ctx->xml2enc == XML_CHAR_ENCODING_NONE) { + ctx->xml2enc = xmlDetectCharEncoding((const xmlChar*)ctx->buf, ctx->bytes); + if (HAVE_ENCODING(ctx->xml2enc)) { + ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, + "Got charset from XML rules.") ; + ctx->encoding = xmlGetCharEncodingName(ctx->xml2enc); + } + } + +/* If none of the above, look for a META-thingey */ +/* also we're probably about to invalidate it, so we remove it. 
*/ + if ( ap_regexec(seek_meta_ctype, ctx->buf, 1, match, 0) == 0 ) { + /* get markers on the start and end of the match */ + rv = apr_brigade_partition(ctx->bbsave, match[0].rm_eo, &cute); + rv = apr_brigade_partition(ctx->bbsave, match[0].rm_so, &cutb); + /* now set length of useful buf for start-of-data hooks */ + ctx->bytes = match[0].rm_so; + if (ctx->encoding == NULL) { + p = apr_pstrndup(r->pool, ctx->buf + match[0].rm_so, + match[0].rm_eo - match[0].rm_so) ; + if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 ) { + if (ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so, + match[1].rm_eo - match[1].rm_so), ctx->encoding) { + ctx->xml2enc = xmlParseCharEncoding(ctx->encoding); + if (HAVE_ENCODING(ctx->xml2enc)) + ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, + "Got charset %s from HTML META", ctx->encoding) ; + } + } + } + + /* cut out the we're invalidating */ + while (cutb != cute) { + b = APR_BUCKET_NEXT(cutb); + APR_BUCKET_REMOVE(cutb); + apr_bucket_destroy(cutb); + cutb = b; + } + /* and leave a string */ + ctx->buf[ctx->bytes] = 0; + } + +/* either it's set to something we found or it's still the default */ +/* Aaargh! libxml2 has undocumented support. So this fails + * if metafix is not active. Have to make it conditional. + * + * No, that means no-metafix breaks things. Deal immediately with + * this particular instance of metafix. + */ + if (!HAVE_ENCODING(ctx->xml2enc)) { + cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module); + if (!ctx->encoding) { + ctx->encoding = cfg->default_charset?cfg->default_charset:"ISO-8859-1"; + } +/* Unsupported charset. Can we get (iconv) support through apr_xlate? */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, + "Charset %s not supported by libxml2; trying apr_xlate", ctx->encoding); + if (apr_xlate_open(&ctx->convset, "UTF-8", ctx->encoding, r->pool) == APR_SUCCESS) { + ctx->xml2enc = XML_CHAR_ENCODING_UTF8 ; + } else { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "Charset %s not supported. 
Consider aliasing it?", ctx->encoding) ; + } + } + + if (!HAVE_ENCODING(ctx->xml2enc)) { + /* Use configuration default as a last resort */ + ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, + "No usable charset information; using configuration default") ; + ctx->xml2enc = (cfg->default_encoding == XML_CHAR_ENCODING_NONE) + ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ; + } + if (ctype && ctx->encoding) { + if (ap_regexec(seek_charset, ctype, 2, match, 0)) { + r->content_type = apr_pstrcat(r->pool, ctype, ";charset=utf-8", NULL); + } else { + char* str = apr_palloc(r->pool, strlen(r->content_type) + + 13 - (match[0].rm_eo - match[0].rm_so) + 1); + memcpy(str, r->content_type, match[1].rm_so); + //memcpy(str + match[1].rm_so, "charset=utf-8", 5); + memcpy(str + match[1].rm_so, "utf-8", 5); + strcpy(str + match[1].rm_so + 5, r->content_type+match[1].rm_eo); + r->content_type = str; + } + } +} + +static apr_status_t xml2enc_filter_init(ap_filter_t* f) { + xml2ctx* ctx; + if (!f->ctx) { + xml2cfg* cfg = ap_get_module_config(f->r->per_dir_config, &xml2enc_module); + f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(xml2ctx)); + ctx->xml2enc = XML_CHAR_ENCODING_NONE; + if (cfg->skipto != NULL) { + ctx->flags |= ENC_SKIPTO; + } + } + return APR_SUCCESS; +} +static apr_status_t xml2enc_ffunc(ap_filter_t* f, apr_bucket_brigade* bb) { + xml2ctx* ctx = f->ctx; + apr_status_t rv; + apr_bucket* b; + apr_bucket* bstart; + apr_size_t insz = 0; + char *ctype; + char *p; + + if (!ctx || !f->r->content_type) { + /* log error about configuring this */ + ap_remove_output_filter(f); + return ap_pass_brigade(f->next, bb) ; + } + + ctype = apr_pstrdup(f->r->pool, f->r->content_type); + for (p = ctype; *p; ++p) + if (isupper(*p)) + *p = tolower(*p); + + /* only act if starts-with "text/" or contains "xml" */ + if (strncmp(ctype, "text/", 5) && !strstr(ctype, "xml")) { + ap_remove_output_filter(f); + return ap_pass_brigade(f->next, bb) ; + } + + if (ctx->bbsave == NULL) { + ctx->bbsave = 
apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc); + } + /* append to any data left over from last time */ + APR_BRIGADE_CONCAT(ctx->bbsave, bb); + + if (!(ctx->flags & ENC_INITIALISED)) { + /* some kind of initialisation required */ + /* Turn all this off when post-processing */ + + /* if we don't have enough data to sniff but more's to come, wait for it */ + rv = apr_brigade_length(ctx->bbsave, 0, &ctx->bblen); + if ((ctx->bblen < BUF_MIN) && (ctx->bblen != -1)) { + APR_BRIGADE_DO(b, ctx->bbsave) { + if (APR_BUCKET_IS_EOS(b)) { + ctx->flags |= ENC_SEEN_EOS; + break; + } + } + if (!(ctx->flags & ENC_SEEN_EOS)) { + /* not enough data to sniff. Wait for more */ + APR_BRIGADE_DO(b, ctx->bbsave) { + apr_bucket_setaside(b, f->r->pool); + } + return APR_SUCCESS; + } + } + if (ctx->bblen == -1) { + ctx->bblen = BUFLEN-1; + } + /* flatten it into a NULL-terminated string */ + ctx->buf = apr_palloc(f->r->pool, (apr_size_t)(ctx->bblen+1)); + ctx->bytes = (apr_size_t)ctx->bblen; + rv = apr_brigade_flatten(ctx->bbsave, ctx->buf, &ctx->bytes); + ctx->buf[ctx->bytes] = 0; + sniff_encoding(f->r, ctx); + /* FIXME: hook here for rewriting start-of-data? 
*/ + /* nah, we only have one action here - call it inline */ + fix_skipto(f->r, ctx); + + /* consume the data we just sniffed */ + /* we need to omit any we just invalidated */ + ctx->flags |= ENC_INITIALISED; + ap_set_module_config(f->r->request_config, &xml2enc_module, ctx); + } + if (ctx->bbnext == NULL) { + ctx->bbnext = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc); + } + + if (!ctx->convset) { + rv = ap_pass_brigade(f->next, ctx->bbsave); + apr_brigade_cleanup(ctx->bbsave); + ap_remove_output_filter(f); + return rv; + } + /* move the data back to bb */ + APR_BRIGADE_CONCAT(bb, ctx->bbsave); + + while (b = APR_BRIGADE_FIRST(bb), b != APR_BRIGADE_SENTINEL(bb)) { + ctx->bytes = 0; + if (APR_BUCKET_IS_METADATA(b)) { + if (APR_BUCKET_IS_EOS(b)) { + /* send remaining data */ + return ap_fflush(f->next, ctx->bbnext); + } else if (APR_BUCKET_IS_FLUSH(b)) { + ap_fflush(f->next, ctx->bbnext); + } + APR_BUCKET_REMOVE(b); + apr_bucket_destroy(b); + } else { /* data bucket */ + char* buf; + apr_size_t bytes = 0; + char fixbuf[BUFLEN]; + apr_bucket* bdestroy = NULL; + if (insz > 0) { /* we have dangling data. Flatten it. */ + buf = fixbuf; + bytes = BUFLEN; + rv = apr_brigade_flatten(bb, buf, &bytes); + if (bytes == insz) { + /* this is only what we've already tried to convert. + * The brigade is exhausted. 
+ * Save remaining data for next time round + */ + + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, + "xml2enc: Setting aside %" APR_SIZE_T_FMT + " unconverted bytes", bytes); + rv = ap_fflush(f->next, ctx->bbnext); + APR_BRIGADE_CONCAT(ctx->bbsave, bb); + APR_BRIGADE_DO(b, ctx->bbsave) { + apr_bucket_setaside(b, f->r->pool); + } + return rv; + } + /* remove the data we've just read */ + rv = apr_brigade_partition(bb, bytes, &bstart); + while (b = APR_BRIGADE_FIRST(bb), b != bstart) { + APR_BUCKET_REMOVE(b); + apr_bucket_destroy(b); + } + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, "xml2enc: consuming %" + APR_SIZE_T_FMT " bytes flattened", bytes); + } + else { + rv = apr_bucket_read(b, (const char**)&buf, &bytes, APR_BLOCK_READ); + APR_BUCKET_REMOVE(b); + bdestroy = b; /* can't destroy until we've finished with the data */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, "xml2enc: consuming %" + APR_SIZE_T_FMT " bytes from bucket", bytes); + } + /* OK, we've got some input we can use in [buf,bytes] */ + if (rv == APR_SUCCESS) { + apr_size_t consumed; + xml2enc_run_preprocess(f, &buf, &bytes); + consumed = insz = bytes; + while (insz > 0) { + if (ctx->bytes == ctx->bblen) { + /* nothing was converted last time! + * break out of this loop! 
+ */ + b = apr_bucket_transient_create(buf+(bytes - insz), insz, + bb->bucket_alloc); + APR_BRIGADE_INSERT_HEAD(bb, b); + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, + "xml2enc: reinserting %" APR_SIZE_T_FMT + " unconsumed bytes from bucket", insz); + break; + } + ctx->bytes = (apr_size_t)ctx->bblen; + rv = apr_xlate_conv_buffer(ctx->convset, buf+(bytes - insz), &insz, + ctx->buf, &ctx->bytes); + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, + "xml2enc: converted %" APR_SIZE_T_FMT "/%" APR_OFF_T_FMT " bytes", + consumed - insz, ctx->bblen - ctx->bytes); +#if DEBUG_XML2ENC + /* never use this in the wild */ + { + static int serial = 0; + const char* fname ; + apr_file_t* file ; + fname = apr_psprintf(f->r->pool, "/tmp/%d-xml2enc.%d", rv, serial++); + apr_file_open(&file, fname, APR_WRITE|APR_TRUNCATE|APR_CREATE, + APR_FPROT_OS_DEFAULT, f->r->pool); + apr_file_write(file, buf+(bytes-consumed), &consumed); + apr_file_close(file); + } +#endif + consumed = insz; + ap_fwrite(f->next, ctx->bbnext, ctx->buf, (apr_size_t)ctx->bblen - ctx->bytes); + switch (rv) { + case APR_SUCCESS: + continue; + case APR_EINCOMPLETE: + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, "INCOMPLETE"); + continue; /* If outbuf was too small, go round again. + * If it was inbuf, we'll break out when we test + * ctx->bytes == ctx->bblen + */ + case APR_EINVAL: /* try skipping one bad byte */ + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, + "Skipping invalid byte(s) in input stream!"); + --insz; + continue; + default: + /* Erk! What's this? 
+ * Bail out, flush, and hope to eat the buf raw + */ + ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, + "Failed to convert input; trying it raw") ; + ctx->convset = NULL; + ap_fflush(f->next, ctx->bbnext); + return ap_pass_brigade(f->next, ctx->bbnext); + } + } + } else { + ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, + "xml2enc: error reading data") ; + } + if (bdestroy) { + apr_bucket_destroy(bdestroy); + } + } + } + return APR_SUCCESS; +} +static apr_status_t xml2enc_charset(request_rec* r, xmlCharEncoding* encp, + const char** encoding) { + xml2ctx* ctx = ap_get_module_config(r->request_config, &xml2enc_module); + if (!ctx || !(ctx->flags & ENC_INITIALISED)) { + return APR_EAGAIN; + } + *encp = ctx->xml2enc; + *encoding = ctx->encoding; + return HAVE_ENCODING(ctx->xml2enc) ? APR_SUCCESS : APR_EGENERAL; +} +#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH +static void xml2enc_hooks(apr_pool_t* pool) { + ap_register_output_filter_protocol("xml2enc", xml2enc_ffunc, + xml2enc_filter_init, AP_FTYPE_RESOURCE, PROTO_FLAGS); + APR_REGISTER_OPTIONAL_FN(xml2enc_filter); + APR_REGISTER_OPTIONAL_FN(xml2enc_charset); + seek_meta_ctype = ap_pregcomp(pool, + "(]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)", + AP_REG_EXTENDED|AP_REG_ICASE) ; + seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)", + AP_REG_EXTENDED|AP_REG_ICASE) ; +} +static const char* set_alias(cmd_parms* cmd, void* CFG, + const char* charset, const char* alias) { + const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY); + if (errmsg != NULL) + return errmsg ; + else if (xmlAddEncodingAlias(charset, alias) == 0) + return NULL; + else + return "Error setting charset alias"; +} + +static const char* set_default(cmd_parms* cmd, void* CFG, const char* charset) { + xml2cfg* cfg = CFG; + cfg->default_charset = charset; + cfg->default_encoding = xmlParseCharEncoding(charset); +#if 0 + switch(cfg->default_encoding) { + case XML_CHAR_ENCODING_NONE: + return "Default charset not 
found"; + case XML_CHAR_ENCODING_ERROR: + /*return "Invalid or unsupported default charset";*/ + default: + return NULL; + } +#endif + return NULL; +} +static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg) { + tattr* attr; + xml2cfg* cfg = CFG; + if (cfg->skipto == NULL) + cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr)); + attr = apr_array_push(cfg->skipto) ; + attr->val = arg; + return NULL ; +} + +static const command_rec xml2enc_cmds[] = { + AP_INIT_TAKE1("xml2EncDefault", set_default, NULL, OR_ALL, + "Usage: xml2EncDefault charset") , + AP_INIT_ITERATE2("xml2EncAlias", set_alias, NULL, RSRC_CONF, + "EncodingAlias charset alias [more aliases]") , + AP_INIT_ITERATE("xml2StartParse", set_skipto, NULL, OR_ALL, + "Ignore anything in front of the first of these elements") , + { NULL } +}; +static void* xml2enc_config(apr_pool_t* pool, char* x) { + xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg)); + ret->default_encoding = XML_CHAR_ENCODING_NONE ; + return ret; +} + +static void* xml2enc_merge(apr_pool_t* pool, void* BASE, void* ADD) { + xml2cfg* base = BASE; + xml2cfg* add = ADD; + xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg)); + ret->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE) + ? base->default_encoding : add->default_encoding ; + ret->default_charset = add->default_charset ? add->default_charset : base->default_charset; + ret->skipto = add->skipto ? add->skipto : base->skipto; + return ret; +} +module AP_MODULE_DECLARE_DATA xml2enc_module = { + STANDARD20_MODULE_STUFF, + xml2enc_config, + xml2enc_merge, + NULL, + NULL, + xml2enc_cmds, + xml2enc_hooks +}; +APR_IMPLEMENT_OPTIONAL_HOOK_RUN_ALL(xml2enc, XML2ENC, int, preprocess, + (ap_filter_t *f, char** bufp, apr_size_t* bytesp), + (f, bufp, bytesp), OK, DECLINED) diff --git a/mod_xml2enc.h b/mod_xml2enc.h new file mode 100644 index 0000000..60c6c36 --- /dev/null +++ b/mod_xml2enc.h @@ -0,0 +1,39 @@ +/* This is free software. 
Use and copy it wherever you want. */ +#ifndef MOD_XML2ENC +#define MOD_XML2ENC + +#define ENCIO_INPUT 0x01 +#define ENCIO_OUTPUT 0x02 +#define ENCIO_INPUT_CHECKS 0x04 +#define ENCIO (ENCIO_INPUT|ENCIO_OUTPUT|ENCIO_INPUT_CHECKS) +#define ENCIO_SKIPTO 0x10 + +/* declarations to deal with WIN32 compile-flag-in-source-code crap */ +#if !defined(WIN32) +#define XML2ENC_DECLARE(type) type +#define XML2ENC_DECLARE_NONSTD(type) type +#define XML2ENC_DECLARE_DATA +#elif defined(XML2ENC_DECLARE_STATIC) +#define XML2ENC_DECLARE(type) type __stdcall +#define XML2ENC_DECLARE_NONSTD(type) type +#define XML2ENC_DECLARE_DATA +#elif defined(XML2ENC_DECLARE_EXPORT) +#define XML2ENC_DECLARE(type) __declspec(dllexport) type __stdcall +#define XML2ENC_DECLARE_NONSTD(type) __declspec(dllexport) type +#define XML2ENC_DECLARE_DATA __declspec(dllexport) +#else +#define XML2ENC_DECLARE(type) __declspec(dllimport) type __stdcall +#define XML2ENC_DECLARE_NONSTD(type) __declspec(dllimport) type +#define XML2ENC_DECLARE_DATA __declspec(dllimport) +#endif + +APR_DECLARE_OPTIONAL_FN(apr_status_t, xml2enc_charset, + (request_rec* r, xmlCharEncoding* enc, const char** cenc)); + +APR_DECLARE_OPTIONAL_FN(apr_status_t, xml2enc_filter, + (request_rec* r, const char* enc, unsigned int mode)); + +APR_DECLARE_EXTERNAL_HOOK(xml2enc, XML2ENC, int, preprocess, + (ap_filter_t *f, char** bufp, apr_size_t* bytesp)) + +#endif -- 2.11.0