Commit b8a398d3 authored by David Monniaux
Browse files
parent db17026d
# Build and clean the bit-sliced TEA self-test binary.
# Neither target names a real file, so both are declared phony.
.PHONY: all clean

all:
	$(CC) -Wall -O2 -D__BSTEA_MAIN_ -o bstea_test bstea_test.c bstea.c

clean:
	rm -f bstea_test
Bit-sliced TEA encryption
from Alfonso De Gregorio, 06 December 2010
https://web.archive.org/web/20131021214351/http://plaintext.crypto.lo.gy/article/378/untwisted-bit-sliced-tea-time
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
#include "bstea.h"
/* TEA key-schedule constant: floor(2^32 / golden ratio) = 0x9e3779b9.
 * The running round sum advances by this amount once per round. */
static const uint32_t delta = 0x9e3779b9;
/* v points to the wordsize-way vectorized plaintext,
* k to the vectorized key */
/* input quantities are disposed in the following way:
v0 <- v[0..31] k0 <- k[0..31] k2 <- k[64..95]
v1 <- v[32..63] k1 <- k[32..63] k3 <- k[96..127]
*/
void encrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r)
{
/* Stride 32 between consecutive words in input quantities */
# define offset_v0 0
# define offset_v1 32
# define offset_k0 0
# define offset_k1 32
# define offset_k2 64
# define offset_k3 96
vector_width_t carry;
vector_width_t axorb;
vector_width_t aandb;
vector_width_t ai;
vector_width_t bi;
vector_width_t borrow;
vector_width_t notaandb;
vector_width_t v1_lshift_4[32];
vector_width_t v1_plus_sum[32]; /* term two */
vector_width_t v1_rshift_5[32];
vector_width_t v1_lshift_4_plus_k0[32]; /* term one */
vector_width_t v1_rshift_5_plus_k1[32]; /* term three */
vector_width_t v0_lshift_4[32];
vector_width_t v0_plus_sum[32]; /* term two */
vector_width_t v0_rshift_5[32];
vector_width_t v0_lshift_4_plus_k2[32]; /* term one */
vector_width_t v0_rshift_5_plus_k3[32]; /* term three */
vector_width_t shift;
int i;
/* setup */
uint32_t sum = 0;
for (i = 0; i < 32; ++i)
v1_lshift_4[i] = v1_plus_sum[i] = v1_rshift_5[i] = \
v1_lshift_4_plus_k0[i] = v1_rshift_5_plus_k1[i] = \
v0_lshift_4[i] = v0_plus_sum[i] = v0_rshift_5[i] = \
v0_lshift_4_plus_k2[i] = v0_rshift_5_plus_k3[i] = 0;
while (r > 0) {
sum += delta;
/* lshift v1 by 4 */
shift = 4;
for (i = 31; i >= 0; i--)
v1_lshift_4[i] = (i >= shift) ? v[offset_v1 + i - shift] : 0;
/* add k0 to v1_lshift_4 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v1_lshift_4[i];
bi = k[offset_k0 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_lshift_4_plus_k0[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* add delta sum to v1 */
carry = 0;
for (i = 0;i < 32;++i) {
/* VECTOR_AT_ONE where the ith bit of the sum is set */
/*
* Each iteration follows the first 32 elements
* in the expansion of multiples of 32/golden-ratio,
* or 32/(1+sqrt(5)/2
*/
ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO;
bi = v[offset_v1 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_plus_sum[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* rshift v1 by 5 */
shift = 5;
for (i = 0; i < 32; ++i)
v1_rshift_5[i] = (i < (32 - shift)) ? v[offset_v1 + i + shift] : 0;
/* add k1 to v1_rshift_5 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v1_rshift_5[i];
bi = k[offset_k1 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_rshift_5_plus_k1[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* xor the three terms and increment v0 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v1_lshift_4_plus_k0[i] ^ v1_plus_sum[i] ^ v1_rshift_5_plus_k1[i];
bi = v[offset_v0 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v[offset_v0 + i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* lshift v0 by 4 */
shift = 4;
for (i = 31; i >= 0; i--)
v0_lshift_4[i] = (i >= shift) ? v[offset_v0 + i - shift] : 0;
/* add k2 and v0_lshift_4 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v0_lshift_4[i];
bi = k[offset_k2 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_lshift_4_plus_k2[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* add delta sum to v0 */
carry = 0;
for (i = 0;i < 32;++i) {
/* VECTOR_AT_ONE where the ith bit of the sum is set */
ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO;
bi = v[offset_v0 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_plus_sum[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* rshift v0 by 5 */
shift = 5;
for (i = 0; i < 32; ++i)
v0_rshift_5[i] = (i < (32 - shift)) ? v[offset_v0 + i + shift] : 0;
/* add k3 to v0_rshift_5 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v0_rshift_5[i];
bi = k[offset_k3 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_rshift_5_plus_k3[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* xor the three terms and increment v1 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v0_lshift_4_plus_k2[i] ^ v0_plus_sum[i] ^ v0_rshift_5_plus_k3[i];
bi = v[offset_v1 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v[offset_v1 + i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
--r;
}
}
/* v points to the wordsize-way vectorized ciphertext,
* k to the vectorized key */
/* input quantities are disposed in the following way:
v0 <- v[0..31] k0 <- k[0..31] k2 <- k[64..95]
v1 <- v[32..63] k1 <- k[32..63] k3 <- k[96..127]
*/
void decrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r)
{
# define offset_v0 0
# define offset_v1 32
# define offset_k0 0
# define offset_k1 32
# define offset_k2 64
# define offset_k3 96
vector_width_t carry;
vector_width_t axorb;
vector_width_t aandb;
vector_width_t ai;
vector_width_t bi;
vector_width_t borrow;
vector_width_t notaandb;
vector_width_t v1_lshift_4[32];
vector_width_t v1_plus_sum[32]; /* term two */
vector_width_t v1_rshift_5[32];
vector_width_t v1_lshift_4_plus_k0[32]; /* term one */
vector_width_t v1_rshift_5_plus_k1[32]; /* term three */
vector_width_t v0_lshift_4[32];
vector_width_t v0_plus_sum[32]; /* term two */
vector_width_t v0_rshift_5[32];
vector_width_t v0_lshift_4_plus_k2[32]; /* term one */
vector_width_t v0_rshift_5_plus_k3[32]; /* term three */
vector_width_t shift;
int i;
/* setup */
uint32_t sum = delta * r;
for (i = 0; i < 32; ++i)
v1_lshift_4[i] = v1_plus_sum[i] = v1_rshift_5[i] = \
v1_lshift_4_plus_k0[i] = v1_rshift_5_plus_k1[i] = \
v0_lshift_4[i] = v0_plus_sum[i] = v0_rshift_5[i] = \
v0_lshift_4_plus_k2[i] = v0_rshift_5_plus_k3[i] = 0;
while (r > 0) {
/* lshift v0 by 4 */
shift = 4;
for (i = 31; i >= 0; i--)
v0_lshift_4[i] = (i >= shift) ? v[offset_v0 + i - shift] : 0;
/* add k2 and v0_lshift_4 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v0_lshift_4[i];
bi = k[offset_k2 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_lshift_4_plus_k2[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* add delta sum to v0 */
carry = 0;
for (i = 0;i < 32;++i) {
/* VECTOR_AT_ONE where the ith bit of the sum is set */
ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO;
bi = v[offset_v0 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_plus_sum[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* rshift v0 by 5 */
shift = 5;
for (i = 0; i < 32; ++i)
v0_rshift_5[i] = (i < (32 - shift)) ? v[offset_v0 + i + shift] : 0;
/* add k3 to v0_rshift_5 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v0_rshift_5[i];
bi = k[offset_k3 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v0_rshift_5_plus_k3[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* xor the three terms and decrement v1 */
borrow = 0;
for (i = 0;i < 32;++i) {
ai = v[offset_v1 + i];
bi = v0_lshift_4_plus_k2[i] ^ v0_plus_sum[i] ^ v0_rshift_5_plus_k3[i];
notaandb = (ai ^ VECTOR_AT_ONE) & bi;
axorb = ai ^ bi;
v[offset_v1 + i] = axorb ^ borrow;
borrow = notaandb | ((ai ^ VECTOR_AT_ONE) & borrow) | (bi & borrow);
}
/* lshift v1 by 4 */
shift = 4;
for (i = 31; i >= 0; i--)
v1_lshift_4[i] = (i >= shift) ? v[offset_v1 + i - shift] : 0;
/* add k0 to v1_lshift_4 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v1_lshift_4[i];
bi = k[offset_k0 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_lshift_4_plus_k0[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* add delta sum to v1 */
carry = 0;
for (i = 0;i < 32;++i) {
/* VECTOR_AT_ONE where the ith bit of the sum is set */
ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO;
bi = v[offset_v1 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_plus_sum[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* rshift v1 by 5 */
shift = 5;
for (i = 0; i < 32; ++i)
v1_rshift_5[i] = (i < (32 - shift)) ? v[offset_v1 + i + shift] : 0;
/* add k1 to v1_rshift_5 */
carry = 0;
for (i = 0;i < 32;++i) {
ai = v1_rshift_5[i];
bi = k[offset_k1 + i];
aandb = ai & bi;
axorb = ai ^ bi;
v1_rshift_5_plus_k1[i] = axorb ^ carry;
carry &= axorb;
carry |= aandb;
}
/* xor the three terms and decrement v0 */
borrow = 0;
for (i = 0;i < 32;++i) {
ai = v[offset_v0 + i];
bi = v1_lshift_4_plus_k0[i] ^ v1_plus_sum[i] ^ v1_rshift_5_plus_k1[i];
notaandb = (ai ^ VECTOR_AT_ONE) & bi;
axorb = ai ^ bi;
v[offset_v0 + i] = axorb ^ borrow;
borrow = notaandb | ((ai ^ VECTOR_AT_ONE) & borrow) | (bi & borrow);
}
sum -= delta;
--r;
}
}
#ifndef __BSTEA_H
#define __BSTEA_H
#include <stdint.h>
#include <limits.h>
#include "bstea_wordsize.h"
/* Round count; encrypt()/decrypt() take the count as an argument, so this
 * is presumably the value callers are expected to pass -- TODO confirm. */
#define TEA_ROUNDS 32
/* Block and key sizes in bits.  They are also the element counts of
 * parallel_blocks_t and parallel_keys_t below (one element per bit). */
#define TEA_BLOCK_SIZE 64
#define TEA_KEY_SIZE 128
/* Select the lane container and its all-ones / all-zeros values from
 * __BSTEA_WORDSIZE (expected to come from bstea_wordsize.h). */
#if __BSTEA_WORDSIZE == 64
typedef uint64_t vector_width_t; /* 64-way bit-level vectorization */
#define VECTOR_AT_ONE 0xffffffffffffffff
#define VECTOR_AT_ZERO 0x0000000000000000
#elif __BSTEA_WORDSIZE == 32
typedef uint32_t vector_width_t; /* 32-way bit-level vectorization */
#define VECTOR_AT_ONE 0xffffffff
#define VECTOR_AT_ZERO 0x00000000
#elif __BSTEA_WORDSIZE == 16
/* NOTE(review): storage is still uint32_t; only the low 16 bits are set
 * in VECTOR_AT_ONE. */
typedef uint32_t vector_width_t; /* 16-way bit-level vectorization */
#define VECTOR_AT_ONE 0xffff
#define VECTOR_AT_ZERO 0x0000
#elif __BSTEA_WORDSIZE == 8
/* NOTE(review): storage is still uint32_t; only the low 8 bits are set
 * in VECTOR_AT_ONE. */
typedef uint32_t vector_width_t; /* 8-way bit-level vectorization */
#define VECTOR_AT_ONE 0xff
#define VECTOR_AT_ZERO 0x00
#else
/* Any other (or undefined) word size falls back to unsigned long. */
typedef unsigned long int vector_width_t; /* word-way bit-level vectorization */
#define VECTOR_AT_ONE ULONG_MAX
#define VECTOR_AT_ZERO 0
#endif
/* Bit-sliced containers: element i holds bit i of the block (or key) for
 * every lane. */
typedef vector_width_t parallel_blocks_t[TEA_BLOCK_SIZE];
typedef vector_width_t parallel_keys_t[TEA_KEY_SIZE];
/* __P is a macro used to wrap function prototypes, so that compilers
that don't understand ANSI C prototypes still work, and ANSI C
compilers can issue warnings about type mismatches. */
#undef __P
#if defined (__STDC__) || defined (_AIX) || (defined (__mips) && defined (_SYSTYPE_SVR4)) || defined(WIN32) || defined(__cplusplus)
# define __P(protos) protos
#else
# define __P(protos) ()
#endif
/* __BEGIN_DECLS should be used at the beginning of your declarations,
so that C++ compilers don't mangle their names. Use __END_DECLS at
the end of C declarations. */
#undef __BEGIN_DECLS
#undef __END_DECLS
#ifdef __cplusplus
# define __BEGIN_DECLS extern "C" {
# define __END_DECLS }
#else
# define __BEGIN_DECLS /* empty */
# define __END_DECLS /* empty */
#endif
/* The following definitions for FAR are needed only for MSDOS mixed
* model programming (small or medium model with some far allocations).
* This was tested only with MSC. If you don't need the mixed model,
* just define FAR to be empty.
*/
#ifdef SYS16BIT
# if defined(M_I86SM) || defined(M_I86MM)
/* MSC small or medium model */
# define SMALL_MEDIUM
# ifdef _MSC_VER
# define FAR _far
# else
# define FAR far
# endif
# endif
# if (defined(__SMALL__) || defined(__MEDIUM__))
/* Turbo C small or medium model */
# define SMALL_MEDIUM
# ifdef __BORLANDC__
# define FAR _far
# else
# define FAR far
# endif
# endif
#endif
/* Windows builds: choose the DLL import/export decoration and, on request,
 * the WINAPI calling convention. */
#if defined(WINDOWS) || defined(WIN32)
# ifdef BSTEA_DLL
# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
/* BSTEA_INTERNAL selects dllexport; without it the symbols are
 * declared dllimport. */
# ifdef BSTEA_INTERNAL
# define BSTEA_EXTERN extern __declspec(dllexport)
# else
# define BSTEA_EXTERN extern __declspec(dllimport)
# endif
# endif
# endif /* BSTEA_DLL */
/* If building or using bstea with the WINAPI/WINAPIV calling convention,
* define BSTEA_WINAPI.
* Caution: the standard BSTEA.DLL is NOT compiled using BSTEA_WINAPI.
*/
# ifdef BSTEA_WINAPI
# ifdef FAR
# undef FAR
# endif
# include <windows.h>
/* No need for _export, use BSTEA_LIB.DEF instead. */
/* For complete Windows compatibility, use WINAPI, not __stdcall. */
# define BSTEA_EXPORT WINAPI
# ifdef WIN32
# define BSTEA_EXPORTVA WINAPIV
# else
# define BSTEA_EXPORTVA FAR CDECL
# endif
# endif
#else
/* Non-Windows builds include <stdbool.h> here. */
# include <stdbool.h>
#endif
/* Fallbacks: any decoration macro not set above becomes plain `extern`
 * or expands to nothing. */
#ifndef BSTEA_EXTERN
# define BSTEA_EXTERN extern
#endif
#ifndef BSTEA_EXPORT
# define BSTEA_EXPORT
#endif
#ifndef BSTEA_EXPORTVA
# define BSTEA_EXPORTVA
#endif
#ifndef FAR
# define FAR
#endif
/*
 * Public API.  Both functions work in place on bit-sliced data:
 *   1st argument - the blocks (plaintext for encrypt, ciphertext for decrypt),
 *                  overwritten with the result;
 *   2nd argument - the bit-sliced key, read only;
 *   3rd argument - the number of rounds to run.
 */
__BEGIN_DECLS
BSTEA_EXTERN void encrypt __P((parallel_blocks_t, const parallel_keys_t, unsigned int));
BSTEA_EXTERN void decrypt __P((parallel_blocks_t, const parallel_keys_t, unsigned int));
__END_DECLS
#endif /* __BSTEA_H */
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include "bstea.h"
/* pack and unpack a single value all over the data path */
/*
 * Broadcast scalar words into every lane of a bit-sliced buffer.
 *
 * v   - len 32-bit words to spread out
 * len - number of words in v
 * bv  - receives 32 * len entries; entry 32*i + p is set to VECTOR_AT_ONE
 *       when bit p of v[i] is set and to VECTOR_AT_ZERO otherwise
 */
static void pack(uint32_t *v, size_t len, vector_width_t *bv) {
    size_t word, bit;

    for (word = 0; word < len; ++word)
        for (bit = 0; bit < 32; ++bit)
            /* Shift the unsigned word down rather than a signed 1 up:
             * 1 << 31 on a 32-bit int is undefined behaviour. */
            bv[32 * word + bit] =
                ((v[word] >> bit) & 1u) ? VECTOR_AT_ONE : VECTOR_AT_ZERO;
}
/*
 * Collapse a bit-sliced buffer back into scalar words.
 *
 * bv  - len bit-plane entries
 * len - number of entries in bv
 * v   - output words; bit (i % 32) of v[i / 32] is set when bv[i] is
 *       non-zero in any lane.  Bits are only OR-ed in, so the caller must
 *       clear v beforehand.
 */
static void unpack(vector_width_t *bv, int len, uint32_t *v) {
    int i;

    for (i = 0; i < len; i++)
        /* Unsigned 1: a plain int 1 << 31 is undefined behaviour. */
        if (bv[i]) v[i >> 5] |= (uint32_t)1 << (i % 32);
}
/* pack and unpack one element at a time */
/*
 * Insert scalar words into one lane of a bit-sliced buffer.
 *
 * v    - len 32-bit words to insert
 * len  - number of words in v
 * bv   - bit-sliced buffer of 32 * len entries; bit `elem` of entry
 *        32*i + p is set when bit p of v[i] is set.  Bits are only OR-ed
 *        in, so the lane must be clear beforehand.
 * elem - lane index; must be smaller than the bit width of vector_width_t
 */
static void pack_elem(uint32_t *v, size_t len, vector_width_t *bv, int elem) {
    /* Build the lane mask in vector_width_t: an int `1 << elem` is undefined
     * for elem >= 31 and cannot address the upper lanes of a 64-bit vector. */
    const vector_width_t lane = (vector_width_t)1 << elem;
    size_t word, bit;

    for (word = 0; word < len; ++word)
        for (bit = 0; bit < 32; ++bit)
            if ((v[word] >> bit) & 1u)
                bv[32 * word + bit] |= lane;
}
/*
 * Extract one lane of a bit-sliced buffer into scalar words.
 *
 * bv   - len bit-plane entries
 * len  - number of entries in bv
 * v    - output words; bit (i % 32) of v[i / 32] is set when bit `elem` of
 *        bv[i] is set.  Bits are only OR-ed in, so the caller must clear v
 *        beforehand.
 * elem - lane index; must be smaller than the bit width of vector_width_t
 */
static void unpack_elem(vector_width_t *bv, int len, uint32_t *v, int elem) {
    int i;

    for (i = 0; i < len; i++)
        /* Shift the vector down instead of an int 1 up: `1 << elem` is
         * undefined for elem >= 31 and misses lanes 32..63 of a 64-bit
         * vector.  Likewise the output bit uses an unsigned 1. */
        if ((bv[i] >> elem) & 1u) v[i >> 5] |= (uint32_t)1 << (i % 32);
}
/* One known-answer test case, stored as 32-bit words: a plaintext block,
 * the ciphertext expected for it, and the key.  Member order matters: the
 * test table initializes these positionally. */
typedef struct tvector_s {
    uint32_t ptext[TEA_BLOCK_SIZE / 32];    /* plaintext block */
    uint32_t ctext[TEA_BLOCK_SIZE / 32];    /* expected ciphertext block */
    uint32_t key[TEA_KEY_SIZE / 32];        /* key */
} tvector_t;
static void test_vectors() {
int i, j;
parallel_blocks_t v;
parallel_keys_t k;
uint32_t ctext[TEA_BLOCK_SIZE >> 5];
uint32_t ptext[TEA_BLOCK_SIZE >> 5];
uint32_t key[TEA_KEY_SIZE >> 5];
tvector_t testv [] = { { {0x00000000, 0x00000000}, \
{0x41ea3a0a, 0x94baa940}, \
{0x00000000, 0x00000000, \
0x00000000, 0x00000000} }, \
{ {0x74736574, 0x2e656d20}, \
{0x6a2a5d77, 0x0992cef6}, \
{0x6805022b, 0x76491406, \
0x260e5d77, 0x4378286c} }, \
{ {0x94baa940, 0x00000000}, \
{0x4e8e7829, 0x7d8236d8}, \
{0x00000000, 0x00000000, \
0x00000000, 0x41ea3a0a} }, \
{ {0x7d8236d8, 0x00000000}, \
{0xc88ba95e, 0xe7edac02}, \
{0x00000000, 0x00000000, \
0x41ea3a0a, 0x4e8e7829} } };
for (i = 0; i < sizeof(testv)/sizeof(tvector_t); ++i) {
for (j = 0;j < TEA_BLOCK_SIZE;++j) v[j] = 0;
for (j = 0;j < TEA_KEY_SIZE;++j) k[j] = 0;
(void) memset(&ctext, 0, 8);
(void) memset(&ptext, 0, 8);
(void) memset(&key, 0, 16);
pack(testv[i].ptext, 2,