summaryrefslogtreecommitdiff
path: root/plugins/mmutf8fix/mmutf8fix.c
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/mmutf8fix/mmutf8fix.c')
-rw-r--r--plugins/mmutf8fix/mmutf8fix.c318
1 files changed, 318 insertions, 0 deletions
diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c
new file mode 100644
index 0000000..e207795
--- /dev/null
+++ b/plugins/mmutf8fix/mmutf8fix.c
@@ -0,0 +1,318 @@
+/* mmutf8fix.c
+ * fix invalid UTF8 sequences. This is begun as a very simple replacer
+ * of non-control characters, and actually breaks some UTF-8 encoding
+ * right now. If the module turns out to be useful, it should be enhanced
+ * to support modes that really detect invalid UTF8. In the longer term
+ * it could also be evolved into an any-charset-to-UTF8 converter. But
+ * first let's see if it really gets into widespread enough use.
+ *
+ * Copyright 2013 Adiscon GmbH.
+ *
+ * This file is part of rsyslog.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * -or-
+ * see COPYING.ASL20 in the source distribution
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "rsyslog.h"
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include "conf.h"
+#include "syslogd-types.h"
+#include "srUtils.h"
+#include "template.h"
+#include "module-template.h"
+#include "errmsg.h"
+
+MODULE_TYPE_OUTPUT
+MODULE_TYPE_NOKEEP
+MODULE_CNFNAME("mmutf8fix")
+
+
+DEFobjCurrIf(errmsg);
+DEF_OMOD_STATIC_DATA
+
+/* define operation modes we have */
+#define MODE_CC 0 /* just fix control characters */
+#define MODE_UTF8 1 /* do real UTF-8 fixing */
+
+/* config variables */
+typedef struct _instanceData {
+ uchar replChar;
+ uint8_t mode; /* operations mode */
+} instanceData;
+
+struct modConfData_s {
+ rsconf_t *pConf; /* our overall config object */
+};
+static modConfData_t *loadModConf = NULL;/* modConf ptr to use for the current load process */
+static modConfData_t *runModConf = NULL;/* modConf ptr to use for the current exec process */
+
+
+/* tables for interfacing with the v6 config system */
+/* action (instance) parameters */
+static struct cnfparamdescr actpdescr[] = {
+ { "mode", eCmdHdlrGetWord, 0 },
+ { "replacementchar", eCmdHdlrGetChar, 0 }
+};
+static struct cnfparamblk actpblk =
+ { CNFPARAMBLK_VERSION,
+ sizeof(actpdescr)/sizeof(struct cnfparamdescr),
+ actpdescr
+ };
+
+BEGINbeginCnfLoad
+CODESTARTbeginCnfLoad
+ loadModConf = pModConf;
+ pModConf->pConf = pConf;
+ENDbeginCnfLoad
+
+BEGINendCnfLoad
+CODESTARTendCnfLoad
+ENDendCnfLoad
+
+BEGINcheckCnf
+CODESTARTcheckCnf
+ENDcheckCnf
+
+BEGINactivateCnf
+CODESTARTactivateCnf
+ runModConf = pModConf;
+ENDactivateCnf
+
+BEGINfreeCnf
+CODESTARTfreeCnf
+ENDfreeCnf
+
+
+BEGINcreateInstance
+CODESTARTcreateInstance
+ENDcreateInstance
+
+
+BEGINisCompatibleWithFeature
+CODESTARTisCompatibleWithFeature
+ENDisCompatibleWithFeature
+
+
+BEGINfreeInstance
+CODESTARTfreeInstance
+ENDfreeInstance
+
+
+static inline void
+setInstParamDefaults(instanceData *pData)
+{
+ pData->mode = MODE_UTF8;
+ pData->replChar = ' ';
+}
+
+BEGINnewActInst
+ struct cnfparamvals *pvals;
+ int i;
+CODESTARTnewActInst
+ DBGPRINTF("newActInst (mmutf8fix)\n");
+ if((pvals = nvlstGetParams(lst, &actpblk, NULL)) == NULL) {
+ ABORT_FINALIZE(RS_RET_MISSING_CNFPARAMS);
+ }
+
+ CODE_STD_STRING_REQUESTnewActInst(1)
+ CHKiRet(OMSRsetEntry(*ppOMSR, 0, NULL, OMSR_TPL_AS_MSG));
+ CHKiRet(createInstance(&pData));
+ setInstParamDefaults(pData);
+
+ for(i = 0 ; i < actpblk.nParams ; ++i) {
+ if(!pvals[i].bUsed)
+ continue;
+ if(!strcmp(actpblk.descr[i].name, "mode")) {
+ if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
+ sizeof("utf-8")-1)) {
+ pData->mode = MODE_UTF8;
+ } else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
+ sizeof("controlcharacters")-1)) {
+ pData->mode = MODE_CC;
+ } else {
+ char *cstr = es_str2cstr(pvals[i].val.d.estr, NULL);
+ errmsg.LogError(0, RS_RET_INVLD_MODE,
+ "mmutf8fix: invalid mode '%s' - ignored",
+ cstr);
+ free(cstr);
+ }
+ } else if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
+ pData->replChar = es_getBufAddr(pvals[i].val.d.estr)[0];
+ } else {
+ dbgprintf("mmutf8fix: program error, non-handled "
+ "param '%s'\n", actpblk.descr[i].name);
+ }
+ }
+
+CODE_STD_FINALIZERnewActInst
+ cnfparamvalsDestruct(pvals, &actpblk);
+ENDnewActInst
+
+
+BEGINdbgPrintInstInfo
+CODESTARTdbgPrintInstInfo
+ENDdbgPrintInstInfo
+
+
+BEGINtryResume
+CODESTARTtryResume
+ENDtryResume
+
+
+static inline void
+doCC(instanceData *pData, uchar *msg, int lenMsg)
+{
+ int i;
+
+ for(i = 0 ; i < lenMsg ; ++i) {
+ if(msg[i] < 32 || msg[i] > 126) {
+ msg[i] = pData->replChar;
+ }
+ }
+}
+
+/* fix an invalid multibyte sequence */
+static inline void
+fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen)
+{
+ int i;
+
+ *endIdx = strtIdx + seqLen;
+ if(*endIdx > lenMsg)
+ *endIdx = lenMsg;
+ for(i = strtIdx ; i < *endIdx ; ++i)
+ msg[i] = pData->replChar;
+}
+
+static inline void
+doUTF8(instanceData *pData, uchar *msg, int lenMsg)
+{
+ uchar c;
+ int8_t seqLen, bytesLeft = 0;
+ uint32_t codepoint;
+ int strtIdx, endIdx;
+ int i;
+
+ for(i = 0 ; i < lenMsg ; ++i) {
+ c = msg[i];
+ if(bytesLeft) {
+ if((c & 0xc0) != 0x80) {
+ /* sequence invalid, invalidate all bytes */
+ fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx,
+ seqLen);
+ i = endIdx - 1;
+ bytesLeft = 0;
+ } else {
+ codepoint = (codepoint << 6) | (c & 0x3f);
+ --bytesLeft;
+ if(bytesLeft == 0) {
+ /* too-large codepoint? */
+ if(codepoint > 0x10FFFF) {
+ fixInvldMBSeq(pData, msg, lenMsg,
+ strtIdx, &endIdx,
+ seqLen);
+ }
+ }
+ }
+ } else {
+ if((c & 0x80) == 0) {
+ /* 1-byte sequence, US-ASCII */
+ ; /* nothing to do, all well */
+ } else if((c & 0xe0) == 0xc0) {
+ /* 2-byte sequence */
+ strtIdx = i;
+ seqLen = bytesLeft = 1;
+ codepoint = c & 0x1f;
+ } else if((c & 0xf0) == 0xe0) {
+ /* 3-byte sequence */
+ strtIdx = i;
+ seqLen = bytesLeft = 2;
+ codepoint = c & 0x0f;
+ } else if((c & 0xf8) == 0xf0) {
+ /* 4-byte sequence */
+ strtIdx = i;
+ seqLen = bytesLeft = 3;
+ codepoint = c & 0x07;
+ } else { /* invalid (5&6 byte forbidden by RFC3629) */
+ msg[i] = pData->replChar;
+ }
+ if(i+bytesLeft >= lenMsg) {
+ int dummy = lenMsg;
+ /* invalid, as rest of message cannot contain full char */
+ fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &dummy, seqLen);
+ i = lenMsg - 1;
+ }
+ }
+ }
+}
+
+BEGINdoAction
+ msg_t *pMsg;
+ uchar *msg;
+ int lenMsg;
+CODESTARTdoAction
+ pMsg = (msg_t*) ppString[0];
+ lenMsg = getMSGLen(pMsg);
+ msg = getMSG(pMsg);
+ if(pData->mode == MODE_CC) {
+ doCC(pData, msg, lenMsg);
+ } else {
+ doUTF8(pData, msg, lenMsg);
+ }
+ENDdoAction
+
+
+BEGINparseSelectorAct
+CODESTARTparseSelectorAct
+CODE_STD_STRING_REQUESTparseSelectorAct(1)
+ if(strncmp((char*) p, ":mmutf8fix:", sizeof(":mmutf8fix:") - 1)) {
+ errmsg.LogError(0, RS_RET_LEGA_ACT_NOT_SUPPORTED,
+ "mmutf8fix supports only v6+ config format, use: "
+ "action(type=\"mmutf8fix\" ...)");
+ }
+ ABORT_FINALIZE(RS_RET_CONFLINE_UNPROCESSED);
+CODE_STD_FINALIZERparseSelectorAct
+ENDparseSelectorAct
+
+
+BEGINmodExit
+CODESTARTmodExit
+ objRelease(errmsg, CORE_COMPONENT);
+ENDmodExit
+
+
+BEGINqueryEtryPt
+CODESTARTqueryEtryPt
+CODEqueryEtryPt_STD_OMOD_QUERIES
+CODEqueryEtryPt_STD_CONF2_OMOD_QUERIES
+CODEqueryEtryPt_STD_CONF2_QUERIES
+ENDqueryEtryPt
+
+
+BEGINmodInit()
+CODESTARTmodInit
+ *ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
+CODEmodInit_QueryRegCFSLineHdlr
+ DBGPRINTF("mmutf8fix: module compiled with rsyslog version %s.\n", VERSION);
+ CHKiRet(objUse(errmsg, CORE_COMPONENT));
+ENDmodInit