/******************************************************************************
* teta.h *
* *
* ASCII text stream integrated encryptor - authenticator *
* in EtA (Encrypt-then-Authenticate) mode *
* ** This program is only a DEMO of cryptographic algorithms. *
* ** Do NOT use it for serious purposes. *
******************************************************************************/
// see teta-docs.txt for problem specification and algorithm description
/*#############################################################################
# #
# #
# SPEC #
# #
# #
#############################################################################*/
#pragma once
// msvc specifics
#if defined(_MSC_VER)
#define __MSC__
// Disable throw, precedence, size_t->int, ptrdiff->u32 warnings
#pragma warning (disable: 4290 4554 4267)
#endif
#ifdef __MSC__
#include<cassert>
#include<ctime>
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string>
#include<iostream>
#include<fstream>
#include<iomanip>
#include<vector>
#include<map>
#include<set>
#include<algorithm>
#if defined( __INTEL_COMPILER )
#include<mathimf.h>
#elif defined( _MSC_VER )
#include <cmath>
#define round(x) double(int((x)))
#else
#include<cmath>
#endif
using namespace std;
////////////////////////////////////////////////////////////////////////////////
// Basics
typedef unsigned char U8; // PROBLEM: char may have more than 8 bits
typedef unsigned short U16; // PROBLEM: short may have more than 16 bits
typedef unsigned long U32; // PROBLEM: long may have more than 32 bits
typedef unsigned long long U64; // PROBLEM: long long may have more than 64 bits
#define V64LEN 2 // 32-bit words
#define V96LEN 3 // 32-bit words
#define V128LEN 4 // 32-bit words
#define V160LEN 5 // 32-bit words
#define V256LEN 8 // 32-bit words
#define V320LEN 10 // 32-bit words
#define V1024LEN 32 // 32-bit words
#define V2176LEN 68 // 32-bit words
#define V4608LEN 144 // 32-bit words
#define V3200LEN 100 // 32-bit words
typedef U32 V64 [V64LEN];
typedef U32 V96 [V96LEN];
typedef U32 V128 [V128LEN];
typedef U32 V160 [V160LEN];
typedef U32 V256 [V256LEN];
typedef U32 V320 [V320LEN];
typedef U32 V1024[V1024LEN];
typedef U32 V2176[V2176LEN];
typedef U32 V4608[V4608LEN];
typedef U32 V3200[V3200LEN];
#define BASE 100
inline U32 lo32 (U64 x) { return U32(x & 0xffffffff); }
inline U32 hi32 (U64 x) { return U32(x >> 32); }
inline U64 mk64 (U32 L, U32 H) { return (U64(H)<<32)|U64(L); }
inline U32 byteswap ( U32 x )
{ U32 u = ((x & 0xFF00FF00) >> 8) | (( x & 0x00FF00FF) << 8);
return (u << 16) | (u >> 16); }
void byteswap( U32 y[], U32 const x[], int len );
// reverse bytes of x and store result to y
// e.g. if x == 0x000102030405060708090a0b, then byteswap( y,x,3 ) yields
// y == 0x0b0a09080706050403020100
// gcc 3.x translates std::_rotl(), std::_rotr() to library function calls
// instead of machine opcodes rol and ror => catastrophic performance.
#ifdef __GNUC__
#define _rotl(x, r) ( ((x) << (r))|((x) >> (32-(r))) )
#define _rotr(x, r) ( ((x) >> (r))|((x) << (32-(r))) )
#endif
// _rotl(), _rotl() are only 32 bit, but we need 64-bit version too
inline U64 rotl64 (U64 x, int r) { return (x << r)|(x >> (64-r)); }
inline U64 rotr64 (U64 x, int r) { return (x >> r)|(x << (64-r)); }
inline double sqr (double x) { return x*x; }
#define LOG2 0.69314718055994530941723212145818 // log(2)
#define ILOG2 1.4426950408889634073599246810019 // 1/log(2)
inline double log2 (double x) { return log(x)*ILOG2; } // logarithm of x in base 2
// Conventional naming:
//
// X,Y Input, output block of the same size. For a block cipher, X is plain
// block, Y is cipherblock. For a block cipher-based hash function, X
// is the previous state of hash value, Y is the new state of hash value.
//
// Z Input block that is usually longer than of X,Y.
// For a block cipher it is primary key. For block cipher-based hash function
// it is the hashed data.
//
// S Input block, used by the keyed hash function uhash64 as the primary key.
// It changes every block hashed.
//
// T Expanded key, ie. array of subkeys. For uhash64 it is same type as Z.
//
// U Input (plain) stream of the b100 stream encryptor and authenticator.
//
// V Output (cipher) stream of the b100 stream encryptor.
//
// W 2nd input (key) stream of the B100 stream encryptor and authenticator.
void crypt128setup (); // constructs subsitution tables for
// the reference implementation and
// optimized implementation(s) of crypt128
// crypt128--reference implementation
void crypt128encryptYXZ_ref (V128& Y, V128 const& X, V160 const& Z);
// encrypt X to Y using key Z
void crypt128decryptYXZ_ref (V128& Y, V128 const& X, V160 const& Z);
// decrypt X to Y using key Z
// crypt128--optimized implementation
void crypt128expandTZ (V2176& T, V160 const& Z);
// expands key Z into subkeys T
void crypt128encryptYXT_opt (V128& Y, V128 const& X, V2176 const& T);
// encrypt X to Y using expanded key T
// crypt160--reference implementation
void crypt160expandTZ (V4608& T, V256 const& Z);
// expands key Z into subkeys T
void crypt160encryptYXT_ref (V160& Y, V160 const& X, V4608 const& T);
// encrypt X to Y using expanded key T
// crypt160--optimized implementations
void crypt160encryptYXT_opt1 (V160& Y, V160 const& X, V4608 const& T);
// encrypt X to Y using expanded key T
void crypt160encryptYXT_opt2 (V160& Y, V160 const& X, V4608 const& T);
// encrypt X to Y using expanded key T
inline
void crypt160encryptYXT (V160& Y, V160 const& X, V4608 const& T)
{ // encrypts X to Y using subkeys T
crypt160encryptYXT_opt2 ( Y, X, T );
}
void uhash160expandTS (V320 &T, V160 const& S);
// expands key S into subkeys T, **updates** T
void uhash160hashYZT (V160& Y, V256 const& Z, V320 const& T);
// hash Z to Y from previous hash Y using subkeys T,
// ** updates ** Y
void b100h64expandTZ (V3200& T, V256 const& Z);
// expands key Z into subkeys T
void b100h64hashYXUWT (V64& Y, V64 const& X, U8 const U, U32 const W2, V3200 const& T);
// hash U to Y from previous hash X using 2 consecutive
// chars (W2) of a not necessarily secret random stream
// and a table of secret subkeys T[]
inline U32 q32 (U32 x) {return x*(2*x+1);} // used by crypt160
inline U64 q64 (U64 x) {return x*(2*x+1);} // used by crypt64
inline U32 p32 (U32 x, U32 y) {return y*(2*x+1) + x;} // used by b100h64
inline U32 f32 (U32 x, U32 y) {return y*(2*x+0x80000001) + x;} // unused?
inline U64 p64 (U64 x, U64 y) {return y*(2*x+1) + x;} // unused ?
U64 c128hash64 (V160 const& Z, U64 salt);
// Using 160-bit key Z, encrypts 64-bit salt (zero-padded to 128-bit),
// returns 64-bit low half of the cipherblock.
U64 muluh_shift (U64 n, U64 m, int sh1,int sh2);
// Computes (m*(n>>sh1)) >> (sh2+64), a particular case
// of Granlund-Montgomery formula
// to perform unsigned division by multiplication
class Hash160; // 160-bit cryptographic hash function
class B100prng; // Base100 keystream generator
class B100mac; // Base100 stream MAC calculator
class B100streamAuthEncryptor; // Integrated base100 stream en-/de-cryptor
// and MAC calculator/verifier in an EtA mode;
// Integrated plainstream substitutor, X en-/de-coder,
// cipherstream en-/de-formatter and Y en-/de-coder
class Keybase; // Manages keys, provides key-related services
class Controller; // Controls user I/O
////////////////////////////////////////////////////////////////////////////////
// Template instantiations
#include "teta-tmpl.h"
template class UsgRng<U8,C160toU8buffer>;
typedef UsgRng<U8,C160toU8buffer> U8prng;
template class UsgRng<U16,C160toU16buffer>;
typedef UsgRng<U16,C160toU16buffer> U16prng;
template class UsgRng<U32>;
typedef UsgRng<U32> U32prng;
template class UsgRng<U64>;
typedef UsgRng<U64> U64prng;
template class UsgRng<U32,C128toU32buffer,Crypt128prng>;
typedef UsgRng<U32,C128toU32buffer,Crypt128prng> U32cipher;
typedef U32prng U32trng;
////////////////////////////////////////////////////////////////////////////////
class B100prng : protected U32cipher
// a template Bxxprng<> would be more general, but much
// more complicated, with bitstream splitting etc.
{
public:
U8 randB100 (); // Returns pseudo-random b100 char (i.e. of 0..99)
U32 rand4B100 (); // Returns 4 consecutive chars of the pseudo-random
// b100 stream in a 4-byte sliding buffer,
// in low-first order, i.e. if the stream is
// a1 a2 a3 a4 a5... then:
// First call returns a1 xx xx xx (xx = garbage)
// Second call returns a2 a1 xx xx
// Third call returns a3 a2 a1 xx
// Fourth call returns a4 a3 a2 a1
// Fifth call returns a5 a4 a3 a2
// k-th call returns a[k] a[k-1] a[k-2] a[k-3]
// Usage:
// This class is used specifically by B100streamAuthEncryptor to
// encrypt and to feed the B100mac algorithm with upto 4 keystreams
// (128-bit MAC).
// Encrypt-only and 32-bit MAC mode may use randB100() to save ~2 cc/char.
// 64-, 96-, 128-bit MAC modes must use rand4B100().
// Use either randB100() or rand4B100(). If both are needed, create two
// B100prng instances!
U8 operator() () {return randB100();}
// only one constructor is provided because this class is used only via
// protected inheritance by B100streamAuthEncryptor, which does not have
// enough infos at its construction time to call more meaningful base class
// constructors.
B100prng();
void seed (V160 const& z, U64 iv);
// TODO: consider adding more constructors and variants of seed()
static
void test ();
private:
static const int NDIGITS =3; // N,
static const int NBITS =20; // K,
static const U64 MAXMULT =1000000; // M, described in 'Algorithms'.
U64 _bitbuf; // (Secondary) buffer of bits
int _bitbuflen; // Number of bits remaining in it
U32 _digbuf; // (Tertiary) buffer of B-ary digits, 0..M-1
int _digbuflen; // Number of digits remaining in it, 0..N
U32 _o4digbuf; // (Quaternary) sliding buffer of 4 consecutive output
// chars for rand4B100()
public: // The following should be private. They are made public for testing only
U32 randU20 (); // Splits the 160-bit output of the underlying prng
// in 20-bit nibbles and returns them, one by one,
// in low-first order
};
////////////////////////////////////////////////////////////////////////////////
class B100mac
{
public:
void initialize (V160 const& z);// Initialize with primary key z
void hashB100 (U8 u, U32 w2); // Hash u using w2 (2 chars) from a
// keystream
void finalize (U64 nonce); // Computes final MAC and store it
// to another internal variable that can
// be read later by getMAC(). MACing may
// continue even after finalize().
U64 getMac () const; // Returns the MAC value
protected:
// only one constructor is provided because this class is used only via
// protected inheritance by B100streamAuthEncryptor, which does not have
// enough infos at its construction time to call more meaningful base class
// constructors.
B100mac();
private:
V3200 _tkey; // array of 100 subkeys derived from the key, used as
// the T parameter to b100h64hash().
V2176 _pdfxkey; // expanded key of the pad derivation function
V64 _hash; // Current hash value
V64 _mac; // Final MAC value
};
////////////////////////////////////////////////////////////////////////////////
class B100streamAuthEncryptor: protected B100prng, protected B100mac
{
// Reemphasize: The keystream for this EtA class is seeded by a 32-bit nonce,
// uses an internal 32-bit counter. The cipherstream is MACed by a 32-bit
// nonce and produces 64-bit MAC with forge probability of 2**-50 (regardless
// of message length). So this class may be used to encrypt + authenticate
// upto 2**32 messages, each upto 2**32 chars long with the max forgeability
// of 2**-18.
public:
B100streamAuthEncryptor(Controller *c, Keybase *kb);
// Plainstream *should* be open in binary mode because only binary mode
// guarantee decrypt(encrypt(s)) == s exactly regardless of environments
// where decrypt() and encrypt() are executed. In text mode, the equality
// is guaranteed only if encrypt() and decrypt() are executed in the same
// environment.
//
// Cipherstream may be open in either mode.
void encrypt (istream& ps, ostream& cs);
void decrypt (istream& cs, ostream& ps);
static void test ();
private:
Controller *_controller; // The assoc. controller. Call it if needed
Keybase *_keybase; // The assoc. keybase. Call it if needed
/*
The substitution process shall be represented by a function P->X, the process
of deformatting the 'ciphertext' of a 'ciphertext_file' shall be presented by a
function C->Y:
(1) Each Pchar/Cchar that is an Xchar/Ychar maps to itself.
(2) Each Pchar/Cchar that is not an Xchar/Ychar maps to:
(2a) a Xchar/Ychar substitute, i.e. Xchar/Ychar in 100 chars above; or
(2b) NUL, if it is to be deleted in the encoded Xchar/Ychar text
before encoding to B100.
(3) Nothing of plaintext may be ignored (deleted), everything, even NUL,
must survive in the encoded Xchar text as itself or as a Xchar substitute.
Notes
- Xchar contains DEL so a char maps to DEL if it does not have a meaningful
substitute, by spec and by rule (3).
- DEL maps to itself in P->X by rule (1), and maps to NUL in C->Y by rule (2b) .
- NUL maps to DEL in P->X by rule (3), and maps to itself in C->Y by rule (2b).
- Any other char of the ciphertext, including any format_effector and
logcomctrl_char which were inserted by the formatter, shall be deleted by the
deformatter before encoding to Ychar. (This is not a rule, just a consequence
of the fundamental requirement that ciphertext must be decryptable.) Therefore,
every such char maps to NUL in C->Y.
- The range of the function P->X contains exactly 100 chars; the range of
the function C->Y contains exactly 100 Ychars + NUL == 101 chars.
- The function P->X will be used to implement (as a single array) an
automated
.P->B100, i.e. substitutor+Xencoder,
.B100->P, i.e. Xdecoder.
- The deformatter's function will be used to implement (as a single array)
an automated
.C->B100, i.e. deformatter's ciphertext-processing part + Yencoder
.B100->C, i.e. Ydecoder + formatter's ciphertext-generating part.
*/
static void constructCode (U8(*PCtoXY)(U8),U8 PCtoB100[256], U8 B100toPC[BASE+1]);
static U8 ansiToX (U8 c); // substitutor's map P -> X
static U8 ansiToY (U8 c); // deformatter's map C -> Y
U8 encryptChar (U8 x); // return encrypted char or NUL
U8 decryptChar (U8 y); // return decrypted char or NUL
// the following should be private. They are public only for testing
public:
void encryptString (char *cs, char const* ps, int len);
void decryptString (string& p, string const& c);
using B100prng::seed;
};
////////////////////////////////////////////////////////////////////////////////
class Hash160
{
public:
Hash160();
void initialize (U64 salt);
// The salt is used primarily as the key for uhash160.
void hashBlock (V256 const& Z, int len=sizeof(V256));
// hash block Z into internal variable
// although there is parameter 'len', Z must be a
// zero-padded 256-bit data block even if len<32.
void finalize ();
// computes final hash value and store it to another
// internal variable, which can be read by getHash().
// Hashing may continue even after finalize().
V160 const& getHash () const;
// returns the final hash value
V160 const& operator()(void const *z, int len, U64 salt = 0);
// hash byte string z of 'len' bytes using 'salt' and return
// the hash value. Example of use:
// char s[1000];
// ...
// Hash64 hash;
// V160 h = hash(s,sizeof(s),time(NULL));
protected:
V160 _chash; // current hash val of the underlying crypt160
V160 _uhash; // current hash val of the accompanying uhash160
V160 _hash; // final (combined) hash value
size_t _datalen; // number of bytes hashed so far
V4608 _crypt160xkey; // expanded key of the underlying crypt160...
V320 _uhash160xkey; // ... and the accompanying uhash160
};
////////////////////////////////////////////////////////////////////////////////
struct Keyinfo
{
V160 key;
U64 keyid; // The hash value of the key
U64 salt; // The salt that was used to hash the key
U64 nUsed; // Number of times the key is used to encrypt/create
// MAC. This number is used as the nonce for the next
// message authenticated under the key, or as the seed
// for the next message encrypted under the key
bool isActive; // Keys are never deleted and may be always used
// to decrypt/verify mac. Furthermore, when a key is
// _active_ it can be used to encrypt/create mac.
};
////////////////////////////////////////////////////////////////////////////////
class Keybase
{
public:
explicit
Keybase (string const& keyfilename); // Load keybase from file
~Keybase (); // Save this keybase to the original file
Keyinfo const * selectKey (U64 const& keyid) const;
// Select certain key to decrypt / verify MAC.
// Return NULL if such key not found in this keybase
Keyinfo obtainKey ();
// Obtain a suitable key to encrypt / create MAC.
// If this keybase finds no suitable key, it generates one.
// If it finds out a key, it increment the seed and the nonce
// before returning the key info.
static void test();
private:
vector<Keyinfo> _keyinfos;
map<U64,int> _keyidx;
string _keyfilename;
Keyinfo const& genKey (); // generate a key and add it to _keyinfos
// returns reference to the just-added key
void loadKeys (string const& keyfilename);
void saveKeys (string const& keyfilename);
};
////////////////////////////////////////////////////////////////////////////////
class Controller
{
public:
Controller();
~Controller ();
};
////////////////////////////////////////////////////////////////////////////////
// Statistical & DSP stuffs
struct Statistic
{
Statistic(vector<double> const& data);
string image(void);
int _n;
double _mu;
double _sigma;
double _max, _min;
};
double quartile (double p, vector<double> const& sorted_data);
// Determines p-quartile (octile, decile,...) of the
// sorted_data, where sorted_data is viewed as a
// multiset of floats; it must be already sorted.
// Example: quartile(0.25, v), is a float q dividing v into 3 partitions
// v0,v01,v1 concisting of floats smaller than q, equaling to q and
// larger than q respectively, such that |v0| <= 0.25 |v| and |v1| <= 0.75 |v|.
// If v01 == empty, then every number in (max v0, min v1) can be the q.
void haardwt (int y[], int const x[], int len);
// Haar discrete wavelet transform of signal x[] to spectrum y[],
// len must be 2**n
double entropy ( map<int,int> & f );
// (Shannon) entropy [bits] of **each** symbol given its frequency
// distribution, f[]
double entropy ( int const s[], int len );
// entropy [bits] of the array 's' of length 'len'
double entropy ( U8 ss[], int len );
// entropy [bits] of the signal sample 'ss' of length 'len',
// consisting of 2nd differential of the usclock() values,
// i.e. jitters of latency of usclock().
////////////////////////////////////////////////////////////////////////////////
// Microsecond clock
//
// Implemented by high-resolution performance counter (HRPC)
// which is available on Win32 only
#include<windows.h>
//------------------------------------------------------------------------------
extern double HRPC_FRQ; // [Hz] HRPC frequency
extern double HRPC_PRD; // [s] HRPC period,ie. span of 1 tick
extern double MACH_FRQ; // [Hz] Machine (CPU) frequency
extern double MACH_PRD; // [s] Machine (CPU) period, ie. span of 1 cc
extern double CC_PER_TICK; // [cc/ct] #CPU cc per HRPC tick
extern double UCLK_LATENCY; // [s] (mean of) uclock latency
LONGLONG usclock (); // [ct] current value of the HRPC counter
void hrpcInit(); // initialization, call it once at the begining
void rampup ( float t );// Ramping up for about t seconds.
// There're many reasons to do so, including:
// - initial page faults,
// - initial cache misses,
// - initial CPU's frequency which may not be the full MACH_FRQ.
////////////////////////////////////////////////////////////////////////////////
// Global variables
extern U32trng g_u32trng;
extern U8prng g_u8prng;
extern U16prng g_u16prng;
extern U32prng g_u32prng;
extern U64prng g_u64prng;
extern Crypt128prng g_crypt128prng;
extern Crypt160prng g_crypt160prng;
extern U8 const P_AESsbox [256];
extern U8 const P_affineSqrt2[256];
extern U8 const P_affineSqrt3[256];
////////////////////////////////////////////////////////////////////////////////
// Inlines
//------------------------------------------------------------------------------
inline U32 B100prng::randU20() // Inline saves only 1-2 cc/char and even makes it slower
// in some previous versions. Be careful!
{
// Performance: brutto 70 cc/char
// (each U20 is equivalent to 3 chars)
if(_bitbuflen < NBITS)
// if _bitbuf is U64, it can load 32 bits once so if(...) will suffice.
// But if _bitbuf was U32 then it could load 8 bits once and there would
// be while(...).
{
// dequeue a U32 from the low end of the primary buffer and enqueue the
// 32 bits to the high end of _bitbuf
_bitbuf |= U64(U32cipher::rand()) << _bitbuflen;
_bitbuflen += 32;
}
// dequeue 20 bits from the low end of _bitbuf
_bitbuflen -= NBITS;
U32 r = U32( _bitbuf ) & ((1<<NBITS)-1);
_bitbuf >>= NBITS;
return r;
}
//------------------------------------------------------------------------------
inline
U8 B100prng::randB100()
{
// Performance:
//
// - The result is extremely sensitive to benchmark parameters,
// i.e. number of repetitions etc. Use consistent parameters or the results
// won't be comparable!
//
// - All benchmarks are made with crypt128encryptYXT_opt().
//
// - The unit is [cc/char]. Throughput of crypt128 must be converted from
// cc/byte to cc/char; e.g. 73.5 cc/byte is equivalent to
// 73.5*(20/8)*2**20/(3*1E6) = 73.5*0.874 = 64 cc/char
//
// Item ICC MSVC GCC
// ========================================================== ==== ==== ====
// 1. Unoptimized: 1 div + 1 mod
// brutto: ---- ---- ----
// 2. Optimized: 2 mul
// brutto: 86 ---- 106
// crypt128, converted 64 80
// netto: 22 ---- 26
//
// => For 32-bit arithmetic MSVC, GCC may have already optimized code.
// The trick of replacing div-mod with multiplication is effective with 64-bit
// arithmetics only
if(!_digbuflen)
{
while((_digbuf = randU20()) >= MAXMULT);
_digbuflen = NDIGITS;
}
//U32 d = _digbuf / BASE; // unoptimized
U32 d = hi32(U64(_digbuf)*0x51EB851F) >> 5; // optimized
// The following line is to divide by 100 using 64-bit arithmetic.
// Do NOT delete! Reserved for future use:
// U64 d= muluh_shift(_buffer,2951479051793528259ULL,2,2); // optimized
//U32 c = _digbuf % BASE; // unoptimized
U32 c = _digbuf - d*BASE; // half-optimized
_digbuf = d;
_digbuflen--;
return U8(c);
}
//------------------------------------------------------------------------------
inline
U32 B100prng::rand4B100()
{
// Performance == randB100() + 2 cc/char
if(!_digbuflen)
{
while((_digbuf = randU20()) >= MAXMULT);
_digbuflen = NDIGITS;
}
//U32 d = _digbuf / BASE; // unoptimized
U32 d = hi32(U64(_digbuf)*0x51EB851F) >> 5; // optimized
//U32 c = _digbuf % BASE; // unoptimized
U32 c = _digbuf - d*BASE; // half-optimized
_digbuf = d;
_digbuflen--;
_o4digbuf = (_o4digbuf >> 8) | (c << 24);
return _o4digbuf;
}
//------------------------------------------------------------------------------
inline
void b100h64hashYXUWT (V64& Y, V64 const& X, U8 const U, U32 const W2,
V3200 const& T)
// hash U to Y from previous hash X using 2 bytes W2
// from a random stream and a table of subkeys T[]
{
Y[0] = p32(X[0] + U32(U),T[ W2 &0xFF]);
Y[1] = p32(X[1] + U32(U),T[(W2>> 8)&0xFF]);
}
//------------------------------------------------------------------------------
inline
void B100mac::hashB100 ( U8 u, U32 w2 )
{
b100h64hashYXUWT ( _hash, _hash, u, w2, _tkey );
}