Description: Fix more cases of unaligned memory accesses
Author: James Clarke
Forwarded: https://github.com/haskell-crypto/cryptonite/pull/175
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
--- a/cbits/cryptonite_align.h
+++ b/cbits/cryptonite_align.h
@@ -34,9 +34,24 @@
 #define need_alignment(p,n) IS_ALIGNED(p,n)
 #endif
 
+static inline uint32_t load_be32_aligned(const uint8_t *p)
+{
+        return be32_to_cpu(*((uint32_t *) p));
+}
+
+static inline uint64_t load_be64_aligned(const uint8_t *p)
+{
+        return be64_to_cpu(*((uint64_t *) p));
+}
+
+static inline uint64_t load_le64_aligned(const uint8_t *p)
+{
+        return le64_to_cpu(*((uint64_t *) p));
+}
+
 static inline uint32_t load_le32_aligned(const uint8_t *p)
 {
-        return le32_to_cpu(*((uint32_t *) p));
+        return le32_to_cpu(*((uint32_t *) p));
 }
 
 static inline void store_le32_aligned(uint8_t *dst, const uint32_t v)
@@ -60,12 +75,83 @@ static inline void store_be64_aligned(ui
 }
 
 #ifdef UNALIGNED_ACCESS_OK
-#define load_le32(a) load_le32_aligned(a)
+
+#define load_be32(p) load_be32_aligned(p)
+#define load_be64(p) load_be64_aligned(p)
+
+#define store_be32(p, v) store_be32_aligned((p), (v))
+#define store_be64(p, v) store_be64_aligned((p), (v))
+
+#define load_le32(p) load_le32_aligned(p)
+#define load_le64(p) load_le64_aligned(p)
+
+#define store_le32(p, v) store_le32_aligned((p), (v))
+#define store_le64(p, v) store_le64_aligned((p), (v))
+
 #else
+
+static inline uint32_t load_be32(const uint8_t *p)
+{
+        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | ((uint32_t)p[2] << 8) | ((uint32_t)p[3]);
+}
+
+static inline uint64_t load_be64(const uint8_t *p)
+{
+        return ((uint64_t)p[0] << 56) | ((uint64_t)p[1] << 48) | ((uint64_t)p[2] << 40) | ((uint64_t)p[3] << 32) |
+               ((uint64_t)p[4] << 24) | ((uint64_t)p[5] << 16) | ((uint64_t)p[6] << 8) | ((uint64_t)p[7]);
+}
+
+static inline void store_be32(uint8_t *p, uint32_t val)
+{
+        p[0] = (val >> 24);
+        p[1] = (val >> 16) & 0xFF;
+        p[2] = (val >>  8) & 0xFF;
+        p[3] = (val      ) & 0xFF;
+}
+
+static inline void store_be64(uint8_t *p, uint64_t val)
+{
+        p[0] = (val >> 56);
+        p[1] = (val >> 48) & 0xFF;
+        p[2] = (val >> 40) & 0xFF;
+        p[3] = (val >> 32) & 0xFF;
+        p[4] = (val >> 24) & 0xFF;
+        p[5] = (val >> 16) & 0xFF;
+        p[6] = (val >>  8) & 0xFF;
+        p[7] = (val      ) & 0xFF;
+}
+
 static inline uint32_t load_le32(const uint8_t *p)
 {
         return ((uint32_t)p[0]) | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
 }
+
+static inline uint64_t load_le64(const uint8_t *p)
+{
+        return ((uint64_t)p[0]) | ((uint64_t)p[1] << 8) | ((uint64_t)p[2] << 16) | ((uint64_t)p[3] << 24) |
+               ((uint64_t)p[4] << 32) | ((uint64_t)p[5] << 40) | ((uint64_t)p[6] << 48) | ((uint64_t)p[7] << 56);
+}
+
+static inline void store_le32(uint8_t *p, uint32_t val)
+{
+        p[0] = (val      ) & 0xFF;
+        p[1] = (val >>  8) & 0xFF;
+        p[2] = (val >> 16) & 0xFF;
+        p[3] = (val >> 24);
+}
+
+static inline void store_le64(uint8_t *p, uint64_t val)
+{
+        p[0] = (val      ) & 0xFF;
+        p[1] = (val >>  8) & 0xFF;
+        p[2] = (val >> 16) & 0xFF;
+        p[3] = (val >> 24) & 0xFF;
+        p[4] = (val >> 32) & 0xFF;
+        p[5] = (val >> 40) & 0xFF;
+        p[6] = (val >> 48) & 0xFF;
+        p[7] = (val >> 56);
+}
+
 #endif
 
 #ifdef UNALIGNED_ACCESS_OK
--- a/cbits/cryptonite_poly1305.c
+++ b/cbits/cryptonite_poly1305.c
@@ -37,11 +37,7 @@
 #include
 #include "cryptonite_poly1305.h"
 #include "cryptonite_bitfn.h"
-
-static inline uint32_t load32(uint8_t *p)
-{
-        return (le32_to_cpu(*((uint32_t *) p)));
-}
+#include "cryptonite_align.h"
 
 static void poly1305_do_chunk(poly1305_ctx *ctx, uint8_t *data, int blocks, int final)
 {
@@ -61,11 +57,11 @@ static void poly1305_do_chunk(poly1305_c
         s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5;
 
         while (blocks--) {
-                h0 += (load32(data+ 0)     ) & 0x3ffffff;
-                h1 += (load32(data+ 3) >> 2) & 0x3ffffff;
-                h2 += (load32(data+ 6) >> 4) & 0x3ffffff;
-                h3 += (load32(data+ 9) >> 6) & 0x3ffffff;
-                h4 += (load32(data+12) >> 8) | hibit;
+                h0 += (load_le32(data+ 0)     ) & 0x3ffffff;
+                h1 += (load_le32(data+ 3) >> 2) & 0x3ffffff;
+                h2 += (load_le32(data+ 6) >> 4) & 0x3ffffff;
+                h3 += (load_le32(data+ 9) >> 6) & 0x3ffffff;
+                h4 += (load_le32(data+12) >> 8) | hibit;
 
                 d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
                 d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
@@ -94,16 +90,16 @@ void cryptonite_poly1305_init(poly1305_c
 
         memset(ctx, 0, sizeof(poly1305_ctx));
 
-        ctx->r[0] = (load32(&k[ 0])     ) & 0x3ffffff;
-        ctx->r[1] = (load32(&k[ 3]) >> 2) & 0x3ffff03;
-        ctx->r[2] = (load32(&k[ 6]) >> 4) & 0x3ffc0ff;
-        ctx->r[3] = (load32(&k[ 9]) >> 6) & 0x3f03fff;
-        ctx->r[4] = (load32(&k[12]) >> 8) & 0x00fffff;
-
-        ctx->pad[0] = load32(&k[16]);
-        ctx->pad[1] = load32(&k[20]);
-        ctx->pad[2] = load32(&k[24]);
-        ctx->pad[3] = load32(&k[28]);
+        ctx->r[0] = (load_le32(&k[ 0])     ) & 0x3ffffff;
+        ctx->r[1] = (load_le32(&k[ 3]) >> 2) & 0x3ffff03;
+        ctx->r[2] = (load_le32(&k[ 6]) >> 4) & 0x3ffc0ff;
+        ctx->r[3] = (load_le32(&k[ 9]) >> 6) & 0x3f03fff;
+        ctx->r[4] = (load_le32(&k[12]) >> 8) & 0x00fffff;
+
+        ctx->pad[0] = load_le32(&k[16]);
+        ctx->pad[1] = load_le32(&k[20]);
+        ctx->pad[2] = load_le32(&k[24]);
+        ctx->pad[3] = load_le32(&k[28]);
 
         ctx->index = 0;
 }
--- a/cbits/cryptonite_aes.c
+++ b/cbits/cryptonite_aes.c
@@ -370,7 +370,7 @@ void cryptonite_aes_gcm_init(aes_gcm *gc
                 cryptonite_gf_mul(&gcm->iv, &gcm->h);
         }
 
-        block128_copy(&gcm->civ, &gcm->iv);
+        block128_copy_aligned(&gcm->civ, &gcm->iv);
 }
 
 void cryptonite_aes_gcm_aad(aes_gcm *gcm, uint8_t *input, uint32_t length)
@@ -399,7 +399,7 @@ void cryptonite_aes_gcm_finish(uint8_t *
         gcm_ghash_add(gcm, &lblock);
 
         cryptonite_aes_encrypt_block(&lblock, key, &gcm->iv);
-        block128_xor(&gcm->tag, &lblock);
+        block128_xor_aligned(&gcm->tag, &lblock);
 
         for (i = 0; i < 16; i++) {
                 tag[i] = gcm->tag.b[i];
@@ -464,7 +464,7 @@ void cryptonite_aes_ocb_init(aes_ocb *oc
 
         memcpy(stretch, ktop.b, 16);
         memcpy(tmp.b, ktop.b + 1, 8);
-        block128_xor(&tmp, &ktop);
+        block128_xor_aligned(&tmp, &ktop);
         memcpy(stretch + 16, tmp.b, 8);
 
         /* initialize the encryption offset from stretch */
@@ -490,22 +490,22 @@ void cryptonite_aes_ocb_aad(aes_ocb *ocb
 
         for (i=1; i<= length/16; i++, input=input+16) {
                 ocb_get_L_i(&tmp, ocb->li, i);
-                block128_xor(&ocb->offset_aad, &tmp);
+                block128_xor_aligned(&ocb->offset_aad, &tmp);
 
                 block128_vxor(&tmp, &ocb->offset_aad, (block128 *) input);
                 cryptonite_aes_encrypt_block(&tmp, key, &tmp);
-                block128_xor(&ocb->sum_aad, &tmp);
+                block128_xor_aligned(&ocb->sum_aad, &tmp);
         }
 
         length = length % 16; /* Bytes in final block */
         if (length > 0) {
-                block128_xor(&ocb->offset_aad, &ocb->lstar);
+                block128_xor_aligned(&ocb->offset_aad, &ocb->lstar);
                 block128_zero(&tmp);
                 block128_copy_bytes(&tmp, input, length);
                 tmp.b[length] = 0x80;
-                block128_xor(&tmp, &ocb->offset_aad);
+                block128_xor_aligned(&tmp, &ocb->offset_aad);
                 cryptonite_aes_encrypt_block(&tmp, key, &tmp);
-                block128_xor(&ocb->sum_aad, &tmp);
+                block128_xor_aligned(&ocb->sum_aad, &tmp);
         }
 }
 
@@ -513,8 +513,8 @@ void cryptonite_aes_ocb_finish(uint8_t *
 {
         block128 tmp;
 
-        block128_vxor(&tmp, &ocb->sum_enc, &ocb->offset_enc);
-        block128_xor(&tmp, &ocb->ldollar);
+        block128_vxor_aligned(&tmp, &ocb->sum_enc, &ocb->offset_enc);
+        block128_xor_aligned(&tmp, &ocb->ldollar);
         cryptonite_aes_encrypt_block((block128 *) tag, key, &tmp);
         block128_xor((block128 *) tag, &ocb->sum_aad);
 }
@@ -699,7 +699,7 @@ static void ocb_generic_crypt(uint8_t *o
         for (i = 1; i <= length/16; i++, input += 16, output += 16) {
                 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
                 ocb_get_L_i(&tmp, ocb->li, i);
-                block128_xor(&ocb->offset_enc, &tmp);
+                block128_xor_aligned(&ocb->offset_enc, &tmp);
 
                 block128_vxor(&tmp, &ocb->offset_enc, (block128 *) input);
                 if (encrypt) {
@@ -716,24 +716,24 @@ static void ocb_generic_crypt(uint8_t *o
         /* process the last partial block if any */
         length = length % 16;
         if (length > 0) {
-                block128_xor(&ocb->offset_enc, &ocb->lstar);
+                block128_xor_aligned(&ocb->offset_enc, &ocb->lstar);
                 cryptonite_aes_encrypt_block(&pad, key, &ocb->offset_enc);
 
                 if (encrypt) {
                         block128_zero(&tmp);
                         block128_copy_bytes(&tmp, input, length);
                         tmp.b[length] = 0x80;
-                        block128_xor(&ocb->sum_enc, &tmp);
-                        block128_xor(&pad, &tmp);
+                        block128_xor_aligned(&ocb->sum_enc, &tmp);
+                        block128_xor_aligned(&pad, &tmp);
                         memcpy(output, pad.b, length);
                         output += length;
                 } else {
-                        block128_copy(&tmp, &pad);
+                        block128_copy_aligned(&tmp, &pad);
                         block128_copy_bytes(&tmp, input, length);
-                        block128_xor(&tmp, &pad);
+                        block128_xor_aligned(&tmp, &pad);
                         tmp.b[length] = 0x80;
                         memcpy(output, tmp.b, length);
-                        block128_xor(&ocb->sum_enc, &tmp);
+                        block128_xor_aligned(&ocb->sum_enc, &tmp);
                         input += length;
                 }
         }
--- a/cbits/cryptonite_md4.c
+++ b/cbits/cryptonite_md4.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 #include "cryptonite_md4.h"
 
 void cryptonite_md4_init(struct md4_ctx *ctx)
@@ -130,9 +131,18 @@ void cryptonite_md4_update(struct md4_ct
                 index = 0;
         }
 
-        /* process as much 64-block as possible */
-        for (; len >= 64; len -= 64, data += 64)
-                md4_do_chunk(ctx, (uint32_t *) data);
+        if (need_alignment(data, 4)) {
+                uint32_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 4);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        md4_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        md4_do_chunk(ctx, (uint32_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -157,5 +167,8 @@ void cryptonite_md4_finalize(struct md4_
         cryptonite_md4_update(ctx, (uint8_t *) &bits, sizeof(bits));
 
         /* output hash */
-        le32_to_cpu_array((uint32_t *) out, ctx->h, 4);
+        store_le32(out   , ctx->h[0]);
+        store_le32(out+ 4, ctx->h[1]);
+        store_le32(out+ 8, ctx->h[2]);
+        store_le32(out+12, ctx->h[3]);
 }
--- a/cbits/cryptonite_md5.c
+++ b/cbits/cryptonite_md5.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 #include "cryptonite_md5.h"
 
 void cryptonite_md5_init(struct md5_ctx *ctx)
@@ -143,9 +144,18 @@ void cryptonite_md5_update(struct md5_ct
                 index = 0;
         }
 
-        /* process as much 64-block as possible */
-        for (; len >= 64; len -= 64, data += 64)
-                md5_do_chunk(ctx, (uint32_t *) data);
+        if (need_alignment(data, 4)) {
+                uint32_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 4);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        md5_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        md5_do_chunk(ctx, (uint32_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -157,7 +167,6 @@ void cryptonite_md5_finalize(struct md5_
         static uint8_t padding[64] = { 0x80, };
         uint64_t bits;
         uint32_t index, padlen;
-        uint32_t *p = (uint32_t *) out;
 
         /* add padding and update data with it */
         bits = cpu_to_le64(ctx->sz << 3);
@@ -171,8 +180,8 @@ void cryptonite_md5_finalize(struct md5_
         cryptonite_md5_update(ctx, (uint8_t *) &bits, sizeof(bits));
 
         /* output hash */
-        p[0] = cpu_to_le32(ctx->h[0]);
-        p[1] = cpu_to_le32(ctx->h[1]);
-        p[2] = cpu_to_le32(ctx->h[2]);
-        p[3] = cpu_to_le32(ctx->h[3]);
+        store_le32(out   , ctx->h[0]);
+        store_le32(out+ 4, ctx->h[1]);
+        store_le32(out+ 8, ctx->h[2]);
+        store_le32(out+12, ctx->h[3]);
 }
--- a/cbits/cryptonite_ripemd.c
+++ b/cbits/cryptonite_ripemd.c
@@ -24,6 +24,7 @@
 
 #include "cryptonite_ripemd.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 #include
 
 void cryptonite_ripemd160_init(struct ripemd160_ctx *ctx)
@@ -265,9 +266,20 @@ void cryptonite_ripemd160_update(struct
                 index = 0;
         }
 
-        for (; len >= 64; len -= 64, data += 64)
-                ripemd160_do_chunk(ctx, (uint32_t *) data);
+        if (need_alignment(data, 4)) {
+                uint32_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 4);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        ripemd160_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        ripemd160_do_chunk(ctx, (uint32_t *) data);
+        }
+
         /* append data into buf */
         if (len)
                 memcpy(ctx->buf + index, data, len);
 }
@@ -277,7 +289,6 @@ void cryptonite_ripemd160_finalize(struc
         static uint8_t padding[64] = { 0x80, };
         uint64_t bits;
         uint32_t index, padlen;
-        uint32_t *p = (uint32_t *) out;
 
         /* add padding and update data with it */
         bits = cpu_to_le64(ctx->sz << 3);
@@ -291,9 +302,9 @@ void cryptonite_ripemd160_finalize(struc
         cryptonite_ripemd160_update(ctx, (uint8_t *) &bits, sizeof(bits));
 
         /* output digest */
-        p[0] = cpu_to_le32(ctx->h[0]);
-        p[1] = cpu_to_le32(ctx->h[1]);
-        p[2] = cpu_to_le32(ctx->h[2]);
-        p[3] = cpu_to_le32(ctx->h[3]);
-        p[4] = cpu_to_le32(ctx->h[4]);
+        store_le32(out   , ctx->h[0]);
+        store_le32(out+ 4, ctx->h[1]);
+        store_le32(out+ 8, ctx->h[2]);
+        store_le32(out+12, ctx->h[3]);
+        store_le32(out+16, ctx->h[4]);
 }
--- a/cbits/cryptonite_salsa.c
+++ b/cbits/cryptonite_salsa.c
@@ -33,6 +33,7 @@
 #include
 #include "cryptonite_salsa.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 static const uint8_t sigma[16] = "expand 32-byte k";
 static const uint8_t tau[16] = "expand 16-byte k";
@@ -58,11 +59,6 @@ static const uint8_t tau[16] = "expand 1
         QR (x15,x12,x13,x14); \
 }
 
-static inline uint32_t load32(const uint8_t *p)
-{
-        return le32_to_cpu(*((uint32_t *) p));
-}
-
 static void salsa_core(int rounds, block *out, const cryptonite_salsa_state *in)
 {
         uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
@@ -126,34 +122,34 @@ void cryptonite_salsa_init_core(cryptoni
         const uint8_t *constants = (keylen == 32) ? sigma : tau;
         int i;
 
-        st->d[0] = load32(constants + 0);
-        st->d[5] = load32(constants + 4);
-        st->d[10] = load32(constants + 8);
-        st->d[15] = load32(constants + 12);
-
-        st->d[1] = load32(key + 0);
-        st->d[2] = load32(key + 4);
-        st->d[3] = load32(key + 8);
-        st->d[4] = load32(key + 12);
+        st->d[0] = load_le32_aligned(constants + 0);
+        st->d[5] = load_le32_aligned(constants + 4);
+        st->d[10] = load_le32_aligned(constants + 8);
+        st->d[15] = load_le32_aligned(constants + 12);
+
+        st->d[1] = load_le32(key + 0);
+        st->d[2] = load_le32(key + 4);
+        st->d[3] = load_le32(key + 8);
+        st->d[4] = load_le32(key + 12);
 
         /* we repeat the key on 128 bits */
         if (keylen == 32)
                 key += 16;
-        st->d[11] = load32(key + 0);
-        st->d[12] = load32(key + 4);
-        st->d[13] = load32(key + 8);
-        st->d[14] = load32(key + 12);
+        st->d[11] = load_le32(key + 0);
+        st->d[12] = load_le32(key + 4);
+        st->d[13] = load_le32(key + 8);
+        st->d[14] = load_le32(key + 12);
 
         st->d[9] = 0;
         switch (ivlen) {
         case 8:
-                st->d[6] = load32(iv + 0);
-                st->d[7] = load32(iv + 4);
+                st->d[6] = load_le32(iv + 0);
+                st->d[7] = load_le32(iv + 4);
                 st->d[8] = 0;
                 break;
         case 12:
-                st->d[6] = load32(iv + 0);
-                st->d[7] = load32(iv + 4);
-                st->d[8] = load32(iv + 8);
+                st->d[6] = load_le32(iv + 0);
+                st->d[7] = load_le32(iv + 4);
+                st->d[8] = load_le32(iv + 8);
         default:
                 return;
         }
--- a/cbits/cryptonite_scrypt.c
+++ b/cbits/cryptonite_scrypt.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 #include "cryptonite_salsa.h"
 
 static void blockmix_salsa8(uint32_t *in, uint32_t *out, uint32_t *X, const uint32_t r)
@@ -49,16 +50,6 @@ static inline uint64_t integerify(uint32
         return B[(2*r-1) * 16] | (uint64_t)B[(2*r-1) * 16 + 1] << 32;
 }
 
-static inline uint32_t load32(const uint8_t *p)
-{
-        return le32_to_cpu(*((uint32_t *) p));
-}
-
-static inline void store32(const uint8_t *p, uint32_t val)
-{
-        *((uint32_t *) p) = cpu_to_le32(val);
-}
-
 void cryptonite_scrypt_smix(uint8_t *B, const uint32_t r, const uint64_t N, uint32_t *V, uint32_t *XY)
 {
         uint32_t *X = XY;
@@ -69,7 +60,7 @@ void cryptonite_scrypt_smix(uint8_t *B,
         const int r32 = 32*r;
 
         for (k = 0; k < r32; k++)
-                X[k] = load32(&B[4 * k]);
+                X[k] = load_le32_aligned(&B[4 * k]);
         for (i = 0; i < N; i += 2) {
                 array_copy32(&V[i * r32], X, r32);
                 blockmix_salsa8(X, Y, Z, r);
@@ -86,5 +77,5 @@ void cryptonite_scrypt_smix(uint8_t *B,
                 blockmix_salsa8(Y, X, Z, r);
         }
         for (k = 0; k < r32; k++)
-                store32(&B[4*k], X[k]);
+                store_le32_aligned(&B[4*k], X[k]);
 }
--- a/cbits/cryptonite_sha1.c
+++ b/cbits/cryptonite_sha1.c
@@ -25,6 +25,7 @@
 #include
 #include "cryptonite_sha1.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 void cryptonite_sha1_init(struct sha1_ctx *ctx)
 {
@@ -173,9 +174,18 @@ void cryptonite_sha1_update(struct sha1_
                 index = 0;
         }
 
-        /* process as much 64-block as possible */
-        for (; len >= 64; len -= 64, data += 64)
-                sha1_do_chunk(ctx, (uint32_t *) data);
+        if (need_alignment(data, 4)) {
+                uint32_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 4);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        sha1_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        sha1_do_chunk(ctx, (uint32_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -187,7 +197,6 @@ void cryptonite_sha1_finalize(struct sha
         static uint8_t padding[64] = { 0x80, };
         uint64_t bits;
         uint32_t index, padlen;
-        uint32_t *p = (uint32_t *) out;
 
         /* add padding and update data with it */
         bits = cpu_to_be64(ctx->sz << 3);
@@ -201,9 +210,9 @@ void cryptonite_sha1_finalize(struct sha
         cryptonite_sha1_update(ctx, (uint8_t *) &bits, sizeof(bits));
 
         /* output hash */
-        p[0] = cpu_to_be32(ctx->h[0]);
-        p[1] = cpu_to_be32(ctx->h[1]);
-        p[2] = cpu_to_be32(ctx->h[2]);
-        p[3] = cpu_to_be32(ctx->h[3]);
-        p[4] = cpu_to_be32(ctx->h[4]);
+        store_be32(out   , ctx->h[0]);
+        store_be32(out+ 4, ctx->h[1]);
+        store_be32(out+ 8, ctx->h[2]);
+        store_be32(out+12, ctx->h[3]);
+        store_be32(out+16, ctx->h[4]);
 }
--- a/cbits/cryptonite_sha256.c
+++ b/cbits/cryptonite_sha256.c
@@ -25,6 +25,7 @@
 #include
 #include "cryptonite_sha256.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 void cryptonite_sha224_init(struct sha224_ctx *ctx)
 {
@@ -134,9 +135,18 @@ void cryptonite_sha256_update(struct sha
                 index = 0;
         }
 
-        /* process as much 64-block as possible */
-        for (; len >= 64; len -= 64, data += 64)
-                sha256_do_chunk(ctx, (uint32_t *) data);
+        if (need_alignment(data, 4)) {
+                uint32_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 4);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        sha256_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        sha256_do_chunk(ctx, (uint32_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -156,7 +166,6 @@ void cryptonite_sha256_finalize(struct s
         static uint8_t padding[64] = { 0x80, };
         uint64_t bits;
         uint32_t i, index, padlen;
-        uint32_t *p = (uint32_t *) out;
 
         /* cpu -> big endian */
         bits = cpu_to_be64(ctx->sz << 3);
@@ -171,5 +180,5 @@ void cryptonite_sha256_finalize(struct s
 
         /* store to digest */
         for (i = 0; i < 8; i++)
-                p[i] = cpu_to_be32(ctx->h[i]);
+                store_be32(out+4*i, ctx->h[i]);
 }
--- a/cbits/cryptonite_skein256.c
+++ b/cbits/cryptonite_skein256.c
@@ -26,6 +26,7 @@
 #include "cryptonite_skein.h"
 #include "cryptonite_skein256.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 static const uint8_t K256_0[2] = { 14, 16, };
 static const uint8_t K256_1[2] = { 52, 57, };
@@ -143,9 +144,18 @@ void cryptonite_skein256_update(struct s
                 ctx->bufindex = 0;
         }
 
-        /* process as much 32-block as possible except the last one in case we finalize */
-        for (; len > 32; len -= 32, data += 32)
-                skein256_do_chunk(ctx, (uint64_t *) data, 32);
+        if (need_alignment(data, 8)) {
+                uint64_t tramp[4];
+                ASSERT_ALIGNMENT(tramp, 8);
+                for (; len > 32; len -= 32, data += 32) {
+                        memcpy(tramp, data, 32);
+                        skein256_do_chunk(ctx, tramp, 32);
+                }
+        } else {
+                /* process as much 32-block as possible except the last one in case we finalize */
+                for (; len > 32; len -= 32, data += 32)
+                        skein256_do_chunk(ctx, (uint64_t *) data, 32);
+        }
 
         /* append data into buf */
         if (len) {
--- a/cbits/cryptonite_skein512.c
+++ b/cbits/cryptonite_skein512.c
@@ -26,6 +26,7 @@
 #include "cryptonite_skein.h"
 #include "cryptonite_skein512.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 static const uint8_t K512_0[4] = { 46, 36, 19, 37, };
 static const uint8_t K512_1[4] = { 33, 27, 14, 42, };
@@ -161,9 +162,18 @@ void cryptonite_skein512_update(struct s
                 ctx->bufindex = 0;
         }
 
-        /* process as much 64-block as possible except the last one in case we finalize */
-        for (; len > 64; len -= 64, data += 64)
-                skein512_do_chunk(ctx, (uint64_t *) data, 64);
+        if (need_alignment(data, 8)) {
+                uint64_t tramp[8];
+                ASSERT_ALIGNMENT(tramp, 8);
+                for (; len > 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        skein512_do_chunk(ctx, tramp, 64);
+                }
+        } else {
+                /* process as much 64-block as possible except the last one in case we finalize */
+                for (; len > 64; len -= 64, data += 64)
+                        skein512_do_chunk(ctx, (uint64_t *) data, 64);
+        }
 
         /* append data into buf */
         if (len) {
--- a/cbits/cryptonite_tiger.c
+++ b/cbits/cryptonite_tiger.c
@@ -25,6 +25,7 @@
 #include
 #include "cryptonite_tiger.h"
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 
 static const uint64_t t1[256] = {
         0x02aab17cf7e90c5eULL,0xac424b03e243a8ecULL,0x72cd5be30dd5fcd3ULL,0x6d019b93f6f97f3aULL,
@@ -381,9 +382,18 @@ void cryptonite_tiger_update(struct tige
                 index = 0;
         }
 
-        /* process as much 64-block as possible */
-        for (; len >= 64; len -= 64, data += 64)
-                tiger_do_chunk(ctx, (uint64_t *) data);
+        if (need_alignment(data, 8)) {
+                uint64_t tramp[8];
+                ASSERT_ALIGNMENT(tramp, 8);
+                for (; len >= 64; len -= 64, data += 64) {
+                        memcpy(tramp, data, 64);
+                        tiger_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 64-block as possible */
+                for (; len >= 64; len -= 64, data += 64)
+                        tiger_do_chunk(ctx, (uint64_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -395,7 +405,6 @@ void cryptonite_tiger_finalize(struct ti
         static uint8_t padding[64] = { 0x01, };
         uint64_t bits;
         uint32_t index, padlen;
-        uint64_t *p = (uint64_t *) out;
 
         /* add padding and update data with it */
         bits = cpu_to_le64(ctx->sz << 3);
@@ -409,7 +418,7 @@ void cryptonite_tiger_finalize(struct ti
         cryptonite_tiger_update(ctx, (uint8_t *) &bits, sizeof(bits));
 
         /* output hash */
-        p[0] = cpu_to_le64(ctx->h[0]);
-        p[1] = cpu_to_le64(ctx->h[1]);
-        p[2] = cpu_to_le64(ctx->h[2]);
+        store_le64(out   , ctx->h[0]);
+        store_le64(out+ 8, ctx->h[1]);
+        store_le64(out+16, ctx->h[2]);
 }
--- a/cbits/cryptonite_xsalsa.c
+++ b/cbits/cryptonite_xsalsa.c
@@ -30,13 +30,9 @@
 #include
 #include
 #include "cryptonite_xsalsa.h"
+#include "cryptonite_align.h"
 #include "cryptonite_bitfn.h"
 
-static inline uint32_t load32(const uint8_t *p)
-{
-        return le32_to_cpu(*((uint32_t *) p));
-}
-
 /* XSalsa20 algorithm as described in https://cr.yp.to/snuffle/xsalsa-20081128.pdf */
 void cryptonite_xsalsa_init(cryptonite_salsa_context *ctx, uint8_t nb_rounds,
                             uint32_t keylen, const uint8_t *key,
@@ -51,8 +47,8 @@ void cryptonite_xsalsa_init(cryptonite_s
            (x6, x7, x8, x9) is the first 128 bits of a 192-bit nonce */
         cryptonite_salsa_init_core(&ctx->st, keylen, key, 8, iv);
 
-        ctx->st.d[ 8] = load32(iv + 8);
-        ctx->st.d[ 9] = load32(iv + 12);
+        ctx->st.d[ 8] = load_le32(iv + 8);
+        ctx->st.d[ 9] = load_le32(iv + 12);
 
         /* Compute (z0, z1, . . . , z15) = doubleround ^(r/2) (x0, x1, . . . , x15) */
         block hSalsa;
@@ -73,8 +69,8 @@ void cryptonite_xsalsa_init(cryptonite_s
         ctx->st.d[12] = hSalsa.d[ 7] - ctx->st.d[ 7];
         ctx->st.d[13] = hSalsa.d[ 8] - ctx->st.d[ 8];
         ctx->st.d[14] = hSalsa.d[ 9] - ctx->st.d[ 9];
-        ctx->st.d[ 6] = load32(iv + 16);
-        ctx->st.d[ 7] = load32(iv + 20);
+        ctx->st.d[ 6] = load_le32(iv + 16);
+        ctx->st.d[ 7] = load_le32(iv + 20);
         ctx->st.d[ 8] = 0;
         ctx->st.d[ 9] = 0;
-}
\ No newline at end of file
+}
--- a/cbits/aes/block128.h
+++ b/cbits/aes/block128.h
@@ -32,6 +32,7 @@
 #define BLOCK128_H
 
 #include
+#include
 
 typedef union {
         uint64_t q[2];
@@ -40,38 +41,71 @@ typedef union {
         uint8_t b[16];
 } block128;
 
-static inline void block128_copy_bytes(block128 *block, uint8_t *src, uint32_t len)
+static inline void block128_copy_bytes(block128 *block, const uint8_t *src, uint32_t len)
 {
         int i;
         for (i = 0; i < len; i++) block->b[i] = src[i];
 }
 
-static inline void block128_copy(block128 *d, const block128 *s)
+static inline void block128_copy_aligned(block128 *d, const block128 *s)
 {
         d->q[0] = s->q[0]; d->q[1] = s->q[1];
 }
 
+static inline void block128_copy(block128 *d, const block128 *s)
+{
+        if (need_alignment(d, 8) || need_alignment(s, 8)) {
+                block128_copy_bytes(d, (const uint8_t *) s, 16);
+        } else {
+                block128_copy_aligned(d, s);
+        }
+}
+
 static inline void block128_zero(block128 *d)
 {
         d->q[0] = 0; d->q[1] = 0;
 }
 
-static inline void block128_xor(block128 *d, const block128 *s)
+static inline void block128_xor_bytes(block128 *block, const uint8_t *src, uint32_t len)
+{
+        int i;
+        for (i = 0; i < len; i++) block->b[i] ^= src[i];
+}
+
+static inline void block128_xor_aligned(block128 *d, const block128 *s)
 {
         d->q[0] ^= s->q[0]; d->q[1] ^= s->q[1];
 }
 
-static inline void block128_vxor(block128 *d, const block128 *s1, const block128 *s2)
+static inline void block128_xor(block128 *d, const block128 *s)
+{
+        if (need_alignment(d, 8) || need_alignment(s, 8)) {
+                block128_xor_bytes(d, (const uint8_t *) s, 16);
+        } else {
+                block128_xor_aligned(d, s);
+        }
+}
+
+static inline void block128_vxor_bytes(block128 *block, const uint8_t *src1, const uint8_t *src2, uint32_t len)
+{
+        int i;
+        for (i = 0; i < len; i++) block->b[i] = src1[i] ^ src2[i];
+}
+
+static inline void block128_vxor_aligned(block128 *d, const block128 *s1, const block128 *s2)
 {
         d->q[0] = s1->q[0] ^ s2->q[0]; d->q[1] = s1->q[1] ^ s2->q[1];
 }
 
-static inline void block128_xor_bytes(block128 *block, uint8_t *src, uint32_t len)
+static inline void block128_vxor(block128 *d, const block128 *s1, const block128 *s2)
 {
-        int i;
-        for (i = 0; i < len; i++) block->b[i] ^= src[i];
+        if (need_alignment(d, 8) || need_alignment(s1, 8) || need_alignment(s2, 8)) {
+                block128_vxor_bytes(d, (const uint8_t *) s1, (const uint8_t *) s2, 16);
+        } else {
+                block128_vxor_aligned(d, s1, s2);
+        }
 }
 
 static inline void block128_inc_be(block128 *b)
--- a/cbits/aes/generic.c
+++ b/cbits/aes/generic.c
@@ -324,21 +324,22 @@ static void create_round_key(uint8_t *ex
 static void aes_main(aes_key *key, uint8_t *state)
 {
         int i = 0;
-        uint8_t rk[16];
+        uint32_t rk[4];
+        uint8_t *rkptr = (uint8_t *) rk;
 
-        create_round_key(key->data, rk);
-        add_round_key(state, rk);
+        create_round_key(key->data, rkptr);
+        add_round_key(state, rkptr);
 
         for (i = 1; i < key->nbr; i++) {
-                create_round_key(key->data + 16 * i, rk);
+                create_round_key(key->data + 16 * i, rkptr);
                 shift_rows(state);
                 mix_columns(state);
-                add_round_key(state, rk);
+                add_round_key(state, rkptr);
         }
 
-        create_round_key(key->data + 16 * key->nbr, rk);
+        create_round_key(key->data + 16 * key->nbr, rkptr);
         shift_rows(state);
-        add_round_key(state, rk);
+        add_round_key(state, rkptr);
 }
 
 static void shift_rows_inv(uint8_t *state)
@@ -374,21 +375,22 @@ static void mix_columns_inv(uint8_t *sta
 static void aes_main_inv(aes_key *key, uint8_t *state)
 {
         int i = 0;
-        uint8_t rk[16];
+        uint32_t rk[4];
+        uint8_t *rkptr = (uint8_t *) rk;
 
-        create_round_key(key->data + 16 * key->nbr, rk);
-        add_round_key(state, rk);
+        create_round_key(key->data + 16 * key->nbr, rkptr);
+        add_round_key(state, rkptr);
 
         for (i = key->nbr - 1; i > 0; i--) {
-                create_round_key(key->data + 16 * i, rk);
+                create_round_key(key->data + 16 * i, rkptr);
                 shift_rows_inv(state);
-                add_round_key(state, rk);
+                add_round_key(state, rkptr);
                 mix_columns_inv(state);
         }
 
-        create_round_key(key->data, rk);
+        create_round_key(key->data, rkptr);
         shift_rows_inv(state);
-        add_round_key(state, rk);
+        add_round_key(state, rkptr);
 }
 
 /* Set the block values, for the block:
@@ -405,26 +407,28 @@ static void aes_main_inv(aes_key *key, u
 
 void cryptonite_aes_generic_encrypt_block(aes_block *output, aes_key *key, aes_block *input)
 {
-        uint8_t block[16];
-        uint8_t *iptr, *optr;
+        uint32_t block[4];
+        uint8_t *iptr, *optr, *bptr;
 
         iptr = (uint8_t *) input;
         optr = (uint8_t *) output;
-        swap_block(block, iptr);
-        aes_main(key, block);
-        swap_block(optr, block);
+        bptr = (uint8_t *) block;
+        swap_block(bptr, iptr);
+        aes_main(key, bptr);
+        swap_block(optr, bptr);
 }
 
 void cryptonite_aes_generic_decrypt_block(aes_block *output, aes_key *key, aes_block *input)
 {
-        uint8_t block[16];
-        uint8_t *iptr, *optr;
+        uint32_t block[4];
+        uint8_t *iptr, *optr, *bptr;
 
         iptr = (uint8_t *) input;
         optr = (uint8_t *) output;
-        swap_block(block, iptr);
-        aes_main_inv(key, block);
-        swap_block(optr, block);
+        bptr = (uint8_t *) block;
+        swap_block(bptr, iptr);
+        aes_main_inv(key, bptr);
+        swap_block(optr, bptr);
 }
 
 void cryptonite_aes_generic_init(aes_key *key, uint8_t *origkey, uint8_t size)
--- a/cbits/cryptonite_sha512.c
+++ b/cbits/cryptonite_sha512.c
@@ -24,6 +24,7 @@
 
 #include
 #include "cryptonite_bitfn.h"
+#include "cryptonite_align.h"
 #include "cryptonite_sha512.h"
 
 void cryptonite_sha384_init(struct sha512_ctx *ctx)
@@ -153,9 +154,18 @@ void cryptonite_sha512_update(struct sha
                 index = 0;
         }
 
-        /* process as much 128-block as possible */
-        for (; len >= 128; len -= 128, data += 128)
-                sha512_do_chunk(ctx, (uint64_t *) data);
+        if (need_alignment(data, 8)) {
+                uint64_t tramp[16];
+                ASSERT_ALIGNMENT(tramp, 8);
+                for (; len >= 128; len -= 128, data += 128) {
+                        memcpy(tramp, data, 128);
+                        sha512_do_chunk(ctx, tramp);
+                }
+        } else {
+                /* process as much 128-block as possible */
+                for (; len >= 128; len -= 128, data += 128)
+                        sha512_do_chunk(ctx, (uint64_t *) data);
+        }
 
         /* append data into buf */
         if (len)
@@ -175,7 +185,6 @@ void cryptonite_sha512_finalize(struct s
         static uint8_t padding[128] = { 0x80, };
         uint32_t i, index, padlen;
         uint64_t bits[2];
-        uint64_t *p = (uint64_t *) out;
 
         /* cpu -> big endian */
         bits[0] = cpu_to_be64((ctx->sz[1] << 3 | ctx->sz[0] >> 61));
@@ -191,7 +200,7 @@ void cryptonite_sha512_finalize(struct s
 
         /* store to digest */
         for (i = 0; i < 8; i++)
-                p[i] = cpu_to_be64(ctx->h[i]);
+                store_be64(out+8*i, ctx->h[i]);
 }
 
 #include
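
Note (illustrative only, not part of the patch hunks above): every *_update() hunk applies the same pattern — when the input pointer may be misaligned for word-sized access, the block is first copied into an aligned scratch buffer ("trampoline") and the word-oriented compression routine is run on the copy; the direct cast is kept only for pointers known to be aligned. The sketch below shows that pattern in isolation. The names my_is_unaligned, my_compress and my_update are hypothetical stand-ins (not cryptonite's need_alignment(), *_do_chunk() or *_update() functions), and it assumes 64-byte blocks of 32-bit words as in MD5/SHA-1/SHA-256.

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for a word-oriented compression function. */
static void my_compress(const uint32_t *w)
{
        (void) w;  /* a real implementation would mix the 16 words into the hash state */
}

/* Returns non-zero when p is not n-byte aligned. */
static int my_is_unaligned(const void *p, size_t n)
{
        return ((uintptr_t) p % n) != 0;
}

static void my_update(const uint8_t *data, size_t len)
{
        if (my_is_unaligned(data, 4)) {
                uint32_t tramp[16];                     /* aligned scratch block */
                for (; len >= 64; len -= 64, data += 64) {
                        memcpy(tramp, data, 64);        /* the copy fixes the alignment */
                        my_compress(tramp);
                }
        } else {
                for (; len >= 64; len -= 64, data += 64)
                        my_compress((const uint32_t *) data);  /* safe: data is aligned */
        }
}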