From 4564a5d17b4c1a4507ac14dc96ad2bcaa675712c Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Tue, 7 Aug 2018 08:02:42 +0000
Subject: [PATCH 01/16] bulletproofs: speedup PROVE

---
 src/ringct/bulletproofs.cc | 122 ++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 64 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 381f50872..0e5b3b55f 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -127,15 +127,6 @@ static void sub_acc_p3(ge_p3 *acc_p3, const rct::key &point)
     ge_p1p1_to_p3(acc_p3, &p1);
 }
 
-static rct::key scalarmultKey(const ge_p3 &P, const rct::key &a)
-{
-  ge_p2 R;
-  ge_scalarmult(&R, a.bytes, &P);
-  rct::key aP;
-  ge_tobytes(aP.bytes, &R);
-  return aP;
-}
-
 static rct::key get_exponent(const rct::key &base, size_t idx)
 {
   static const std::string salt("bulletproof");
@@ -193,23 +184,28 @@ static rct::key vector_exponent(const rct::keyV &a, const rct::keyV &b)
 }
 
 /* Compute a custom vector-scalar commitment */
-static rct::key vector_exponent_custom(const rct::keyV &A, const rct::keyV &B, const rct::keyV &a, const rct::keyV &b)
+static rct::key cross_vector_exponent8(size_t size, const std::vector<ge_p3> &A, size_t Ao, const std::vector<ge_p3> &B, size_t Bo, const rct::keyV &a, size_t ao, const rct::keyV &b, size_t bo, const ge_p3 *extra_point, const rct::key *extra_scalar)
 {
-  CHECK_AND_ASSERT_THROW_MES(A.size() == B.size(), "Incompatible sizes of A and B");
-  CHECK_AND_ASSERT_THROW_MES(a.size() == b.size(), "Incompatible sizes of a and b");
-  CHECK_AND_ASSERT_THROW_MES(a.size() == A.size(), "Incompatible sizes of a and A");
-  CHECK_AND_ASSERT_THROW_MES(a.size() <= maxN*maxM, "Incompatible sizes of a and maxN");
+  CHECK_AND_ASSERT_THROW_MES(size + Ao <= A.size(), "Incompatible size for A"); // each window [offset, offset+size) must fit inside its vector
+  CHECK_AND_ASSERT_THROW_MES(size + Bo <= B.size(), "Incompatible size for B");
+  CHECK_AND_ASSERT_THROW_MES(size + ao <= a.size(), "Incompatible size for a");
+  CHECK_AND_ASSERT_THROW_MES(size + bo <= b.size(), "Incompatible size for b");
+  CHECK_AND_ASSERT_THROW_MES(size <= maxN*maxM, "size is too large");
+  CHECK_AND_ASSERT_THROW_MES(!!extra_point == !!extra_scalar, "only one of extra point/scalar present"); // extra point and scalar come as a pair or not at all
 
   std::vector<MultiexpData> multiexp_data;
-  multiexp_data.reserve(a.size()*2);
-  for (size_t i = 0; i < a.size(); ++i)
+  multiexp_data.resize(size*2 + (!!extra_point)); // two entries per index, plus one slot for the optional extra term
+  for (size_t i = 0; i < size; ++i)
   {
-    multiexp_data.resize(multiexp_data.size() + 1);
-    multiexp_data.back().scalar = a[i];
-    CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&multiexp_data.back().point, A[i].bytes) == 0, "ge_frombytes_vartime failed");
-    multiexp_data.resize(multiexp_data.size() + 1);
-    multiexp_data.back().scalar = b[i];
-    CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&multiexp_data.back().point, B[i].bytes) == 0, "ge_frombytes_vartime failed");
+    sc_mul(multiexp_data[i*2].scalar.bytes, a[ao+i].bytes, INV_EIGHT.bytes); // even slot: a[ao+i] * INV_EIGHT paired with A[Ao+i]
+    multiexp_data[i*2].point = A[Ao+i];
+    sc_mul(multiexp_data[i*2+1].scalar.bytes, b[bo+i].bytes, INV_EIGHT.bytes); // odd slot: b[bo+i] * INV_EIGHT paired with B[Bo+i]
+    multiexp_data[i*2+1].point = B[Bo+i];
+  }
+  if (extra_point)
+  {
+    sc_mul(multiexp_data.back().scalar.bytes, extra_scalar->bytes, INV_EIGHT.bytes); // the extra term uses the last slot and is scaled by INV_EIGHT too
+    multiexp_data.back().point = *extra_point;
   }
   return multiexp(multiexp_data, false);
 }
@@ -273,16 +269,19 @@ static rct::keyV hadamard(const rct::keyV &a, const rct::keyV &b)
   return res;
 }
 
-/* Given two curvepoint arrays, construct the Hadamard product */
-static rct::keyV hadamard2(const rct::keyV &a, const rct::keyV &b)
+/* Folds a curvepoint array in half, in place: each v[n] in the lower half is replaced by a two way scaled combination (scalars a and b) of v[n] and its partner v[size/2+n], then v is shrunk to its lower half */
+static void hadamard_fold(std::vector<ge_p3> &v, const rct::key &a, const rct::key &b)
 {
-  CHECK_AND_ASSERT_THROW_MES(a.size() == b.size(), "Incompatible sizes of a and b");
-  rct::keyV res(a.size());
-  for (size_t i = 0; i < a.size(); ++i)
+  CHECK_AND_ASSERT_THROW_MES((v.size() & 1) == 0, "Vector size should be even"); // lower and upper halves must pair up exactly
+  const size_t sz = v.size() / 2; // length of each half
+  for (size_t n = 0; n < sz; ++n)
   {
-    rct::addKeys(res[i], a[i], b[i]);
+    ge_dsmp c[2]; // one precomputed table per input point
+    ge_dsm_precomp(c[0], &v[n]); // table for the lower half point
+    ge_dsm_precomp(c[1], &v[sz + n]); // table for its partner in the upper half
+    ge_double_scalarmult_precomp_vartime2_p3(&v[n], a.bytes, c[0], b.bytes, c[1]); // overwrites v[n]; presumably a*v[n] + b*v[sz+n] - confirm against crypto-ops
   }
-  return res;
+  v.resize(sz); // the upper half has been folded into the lower half, drop it
 }
 
 /* Add two vectors */
@@ -326,17 +325,6 @@ static rct::keyV vector_dup(const rct::key &x, size_t N)
   return rct::keyV(N, x);
 }
 
-/* Exponentiate a curve vector by a scalar */
-static rct::keyV vector_scalar2(const rct::keyV &a, const rct::key &x)
-{
-  rct::keyV res(a.size());
-  for (size_t i = 0; i < a.size(); ++i)
-  {
-    rct::scalarmultKey(res[i], a[i], x);
-  }
-  return res;
-}
-
 /* Get the sum of a vector's elements */
 static rct::key vector_sum(const rct::keyV &a)
 {
@@ -620,16 +608,16 @@ try_again:
 
   // These are used in the inner product rounds
   size_t nprime = N;
-  rct::keyV Gprime(N);
-  rct::keyV Hprime(N);
+  std::vector<ge_p3> Gprime(N);
+  std::vector<ge_p3> Hprime(N);
   rct::keyV aprime(N);
   rct::keyV bprime(N);
   const rct::key yinv = invert(y);
   rct::key yinvpow = rct::identity();
   for (size_t i = 0; i < N; ++i)
   {
-    Gprime[i] = Gi[i];
-    Hprime[i] = scalarmultKey(Hi_p3[i], yinvpow);
+    Gprime[i] = Gi_p3[i];
+    ge_scalarmult_p3(&Hprime[i], yinvpow.bytes, &Hi_p3[i]);
     sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
     aprime[i] = l[i];
     bprime[i] = r[i];
@@ -652,14 +640,10 @@ try_again:
     rct::key cR = inner_product(slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
 
     // PAPER LINES 18-19
-    L[round] = vector_exponent_custom(slice(Gprime, nprime, Gprime.size()), slice(Hprime, 0, nprime), slice(aprime, 0, nprime), slice(bprime, nprime, bprime.size()));
     sc_mul(tmp.bytes, cL.bytes, x_ip.bytes);
-    rct::addKeys(L[round], L[round], rct::scalarmultH(tmp));
-    L[round] = rct::scalarmultKey(L[round], INV_EIGHT);
-    R[round] = vector_exponent_custom(slice(Gprime, 0, nprime), slice(Hprime, nprime, Hprime.size()), slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
+    L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, &ge_p3_H, &tmp);
     sc_mul(tmp.bytes, cR.bytes, x_ip.bytes);
-    rct::addKeys(R[round], R[round], rct::scalarmultH(tmp));
-    R[round] = rct::scalarmultKey(R[round], INV_EIGHT);
+    R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, &ge_p3_H, &tmp);
 
     // PAPER LINES 21-22
     w[round] = hash_cache_mash(hash_cache, L[round], R[round]);
@@ -672,8 +656,11 @@ try_again:
 
     // PAPER LINES 24-25
     const rct::key winv = invert(w[round]);
-    Gprime = hadamard2(vector_scalar2(slice(Gprime, 0, nprime), winv), vector_scalar2(slice(Gprime, nprime, Gprime.size()), w[round]));
-    Hprime = hadamard2(vector_scalar2(slice(Hprime, 0, nprime), w[round]), vector_scalar2(slice(Hprime, nprime, Hprime.size()), winv));
+    if (nprime > 1)
+    {
+      hadamard_fold(Gprime, winv, w[round]);
+      hadamard_fold(Hprime, w[round], winv);
+    }
 
     // PAPER LINES 28-29
     aprime = vector_add(vector_scalar(slice(aprime, 0, nprime), w[round]), vector_scalar(slice(aprime, nprime, aprime.size()), winv));
@@ -914,16 +901,16 @@ try_again:
 
   // These are used in the inner product rounds
   size_t nprime = MN;
-  rct::keyV Gprime(MN);
-  rct::keyV Hprime(MN);
+  std::vector<ge_p3> Gprime(MN);
+  std::vector<ge_p3> Hprime(MN);
   rct::keyV aprime(MN);
   rct::keyV bprime(MN);
   const rct::key yinv = invert(y);
   rct::key yinvpow = rct::identity();
   for (size_t i = 0; i < MN; ++i)
   {
-    Gprime[i] = Gi[i];
-    Hprime[i] = scalarmultKey(Hi_p3[i], yinvpow);
+    Gprime[i] = Gi_p3[i];
+    ge_scalarmult_p3(&Hprime[i], yinvpow.bytes, &Hi_p3[i]);
     sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
     aprime[i] = l[i];
     bprime[i] = r[i];
@@ -942,18 +929,18 @@ try_again:
     nprime /= 2;
 
     // PAPER LINES 16-17
+    PERF_TIMER_START_BP(PROVE_inner_product);
     rct::key cL = inner_product(slice(aprime, 0, nprime), slice(bprime, nprime, bprime.size()));
     rct::key cR = inner_product(slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
+    PERF_TIMER_STOP(PROVE_inner_product);
 
     // PAPER LINES 18-19
-    L[round] = vector_exponent_custom(slice(Gprime, nprime, Gprime.size()), slice(Hprime, 0, nprime), slice(aprime, 0, nprime), slice(bprime, nprime, bprime.size()));
+    PERF_TIMER_START_BP(PROVE_LR);
     sc_mul(tmp.bytes, cL.bytes, x_ip.bytes);
-    rct::addKeys(L[round], L[round], rct::scalarmultH(tmp));
-    L[round] = rct::scalarmultKey(L[round], INV_EIGHT);
-    R[round] = vector_exponent_custom(slice(Gprime, 0, nprime), slice(Hprime, nprime, Hprime.size()), slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
+    L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, &ge_p3_H, &tmp);
     sc_mul(tmp.bytes, cR.bytes, x_ip.bytes);
-    rct::addKeys(R[round], R[round], rct::scalarmultH(tmp));
-    R[round] = rct::scalarmultKey(R[round], INV_EIGHT);
+    R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, &ge_p3_H, &tmp);
+    PERF_TIMER_STOP(PROVE_LR);
 
     // PAPER LINES 21-22
     w[round] = hash_cache_mash(hash_cache, L[round], R[round]);
@@ -966,12 +953,19 @@ try_again:
 
     // PAPER LINES 24-25
     const rct::key winv = invert(w[round]);
-    Gprime = hadamard2(vector_scalar2(slice(Gprime, 0, nprime), winv), vector_scalar2(slice(Gprime, nprime, Gprime.size()), w[round]));
-    Hprime = hadamard2(vector_scalar2(slice(Hprime, 0, nprime), w[round]), vector_scalar2(slice(Hprime, nprime, Hprime.size()), winv));
+    if (nprime > 1)
+    {
+      PERF_TIMER_START_BP(PROVE_hadamard2);
+      hadamard_fold(Gprime, winv, w[round]);
+      hadamard_fold(Hprime, w[round], winv);
+      PERF_TIMER_STOP(PROVE_hadamard2);
+    }
 
     // PAPER LINES 28-29
+    PERF_TIMER_START_BP(PROVE_prime);
     aprime = vector_add(vector_scalar(slice(aprime, 0, nprime), w[round]), vector_scalar(slice(aprime, nprime, aprime.size()), winv));
     bprime = vector_add(vector_scalar(slice(bprime, 0, nprime), winv), vector_scalar(slice(bprime, nprime, bprime.size()), w[round]));
+    PERF_TIMER_STOP(PROVE_prime);
 
     ++round;
   }

From a49a17618fcb4c734248dcc06954c86dcc3fc5cd Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Tue, 7 Aug 2018 09:59:14 +0000
Subject: [PATCH 02/16] bulletproofs: shave off a lot of scalar muls from the
 g/h construction

---
 src/ringct/bulletproofs.cc | 59 +++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 0e5b3b55f..8a6ea2dce 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -1138,41 +1138,60 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       winv[i] = invert(w[i]);
     PERF_TIMER_STOP(VERIFY_line_24_25_invert);
 
+    // precalc
+    PERF_TIMER_START_BP(VERIFY_line_24_25_precalc);
+    rct::keyV w_cache(1<<rounds);
+    w_cache[0] = winv[0];
+    w_cache[1] = w[0];
+    for (size_t j = 1; j < rounds; ++j)
+    {
+      const size_t slots = 1<<(j+1);
+      for (size_t s = slots; s-- > 0; --s)
+      {
+        sc_mul(w_cache[s].bytes, w_cache[s/2].bytes, w[j].bytes);
+        sc_mul(w_cache[s-1].bytes, w_cache[s/2].bytes, winv[j].bytes);
+      }
+    }
+    PERF_TIMER_STOP(VERIFY_line_24_25_precalc);
+
     for (size_t i = 0; i < MN; ++i)
     {
-      // Convert the index to binary IN REVERSE and construct the scalar exponent
       rct::key g_scalar = proof.a;
       rct::key h_scalar;
-      sc_mul(h_scalar.bytes, proof.b.bytes, yinvpow.bytes);
+      if (i == 0)
+        h_scalar = proof.b;
+      else
+        sc_mul(h_scalar.bytes, proof.b.bytes, yinvpow.bytes);
 
-      for (size_t j = rounds; j-- > 0; )
-      {
-        size_t J = w.size() - j - 1;
-
-        if ((i & (((size_t)1)<<j)) == 0)
-        {
-          sc_mul(g_scalar.bytes, g_scalar.bytes, winv[J].bytes);
-          sc_mul(h_scalar.bytes, h_scalar.bytes, w[J].bytes);
-        }
-        else
-        {
-          sc_mul(g_scalar.bytes, g_scalar.bytes, w[J].bytes);
-          sc_mul(h_scalar.bytes, h_scalar.bytes, winv[J].bytes);
-        }
-      }
+      // Convert the index to binary IN REVERSE and construct the scalar exponent
+      sc_mul(g_scalar.bytes, g_scalar.bytes, w_cache[i].bytes);
+      sc_mul(h_scalar.bytes, h_scalar.bytes, w_cache[(~i) & (MN-1)].bytes);
 
       // Adjust the scalars using the exponents from PAPER LINE 62
       sc_add(g_scalar.bytes, g_scalar.bytes, z.bytes);
       CHECK_AND_ASSERT_MES(2+i/N < zpow.size(), false, "invalid zpow index");
       CHECK_AND_ASSERT_MES(i%N < twoN.size(), false, "invalid twoN index");
       sc_mul(tmp.bytes, zpow[2+i/N].bytes, twoN[i%N].bytes);
-      sc_muladd(tmp.bytes, z.bytes, ypow.bytes, tmp.bytes);
-      sc_mulsub(h_scalar.bytes, tmp.bytes, yinvpow.bytes, h_scalar.bytes);
+      if (i == 0)
+      {
+        sc_add(tmp.bytes, tmp.bytes, z.bytes);
+        sc_sub(h_scalar.bytes, h_scalar.bytes, tmp.bytes);
+      }
+      else
+      {
+        sc_muladd(tmp.bytes, z.bytes, ypow.bytes, tmp.bytes);
+        sc_mulsub(h_scalar.bytes, tmp.bytes, yinvpow.bytes, h_scalar.bytes);
+      }
 
       sc_muladd(z4[i].bytes, g_scalar.bytes, weight.bytes, z4[i].bytes);
       sc_muladd(z5[i].bytes, h_scalar.bytes, weight.bytes, z5[i].bytes);
 
-      if (i != MN-1)
+      if (i == 0)
+      {
+        yinvpow = yinv;
+        ypow = y;
+      }
+      else if (i != MN-1)
       {
         sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
         sc_mul(ypow.bytes, ypow.bytes, y.bytes);

From a621d6c8b4b44bb3d97eed0069a7a539cab910ce Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 8 Aug 2018 12:14:13 +0000
Subject: [PATCH 03/16] bulletproofs: random minor speedups

---
 src/ringct/bulletproofs.cc | 78 +++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 8a6ea2dce..2d2e6af36 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -93,18 +93,6 @@ static bool is_reduced(const rct::key &scalar)
   return scalar == reduced;
 }
 
-static void addKeys_acc_p3(ge_p3 *acc_p3, const rct::key &a, const rct::key &point)
-{
-    ge_p3 p3;
-    CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&p3, point.bytes) == 0, "ge_frombytes_vartime failed");
-    ge_scalarmult_p3(&p3, a.bytes, &p3);
-    ge_cached cached;
-    ge_p3_to_cached(&cached, acc_p3);
-    ge_p1p1 p1;
-    ge_add(&p1, &p3, &cached);
-    ge_p1p1_to_p3(acc_p3, &p1);
-}
-
 static void add_acc_p3(ge_p3 *acc_p3, const rct::key &point)
 {
     ge_p3 p3;
@@ -435,10 +423,13 @@ Bulletproof bulletproof_PROVE(const rct::key &sv, const rct::key &gamma)
 
   rct::key V;
   rct::keyV aL(N), aR(N);
+  rct::key tmp, tmp2;
 
   PERF_TIMER_START_BP(PROVE_v);
-  rct::addKeys2(V, gamma, sv, rct::H);
-  V = rct::scalarmultKey(V, INV_EIGHT);
+  rct::key gamma8, sv8;
+  sc_mul(gamma8.bytes, gamma.bytes, INV_EIGHT.bytes);
+  sc_mul(sv8.bytes, sv.bytes, INV_EIGHT.bytes);
+  rct::addKeys2(V, gamma8, sv8, rct::H);
   PERF_TIMER_STOP(PROVE_v);
 
   PERF_TIMER_START_BP(PROVE_aLaR);
@@ -515,7 +506,6 @@ try_again:
   const auto yN = vector_powers(y, N);
 
   rct::key ip1y = vector_sum(yN);
-  rct::key tmp;
   sc_muladd(t0.bytes, z.bytes, ip1y.bytes, t0.bytes);
 
   rct::key zsq;
@@ -563,10 +553,16 @@ try_again:
   // PAPER LINES 47-48
   rct::key tau1 = rct::skGen(), tau2 = rct::skGen();
 
-  rct::key T1 = rct::addKeys(rct::scalarmultH(t1), rct::scalarmultBase(tau1));
-  T1 = rct::scalarmultKey(T1, INV_EIGHT);
-  rct::key T2 = rct::addKeys(rct::scalarmultH(t2), rct::scalarmultBase(tau2));
-  T2 = rct::scalarmultKey(T2, INV_EIGHT);
+  rct::key T1, T2;
+  ge_p3 p3;
+  sc_mul(tmp.bytes, t1.bytes, INV_EIGHT.bytes);
+  sc_mul(tmp2.bytes, tau1.bytes, INV_EIGHT.bytes);
+  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
+  ge_p3_tobytes(T1.bytes, &p3);
+  sc_mul(tmp.bytes, t2.bytes, INV_EIGHT.bytes);
+  sc_mul(tmp2.bytes, tau2.bytes, INV_EIGHT.bytes);
+  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
+  ge_p3_tobytes(T2.bytes, &p3);
 
   // PAPER LINES 49-51
   rct::key x = hash_cache_mash(hash_cache, z, T1, T2);
@@ -671,7 +667,7 @@ try_again:
   PERF_TIMER_STOP(PROVE_step4);
 
   // PAPER LINE 58 (with inclusions from PAPER LINE 8 and PAPER LINE 20)
-  return Bulletproof(V, A, S, T1, T2, taux, mu, L, R, aprime[0], bprime[0], t);
+  return Bulletproof(V, A, S, T1, T2, taux, mu, std::move(L), std::move(R), aprime[0], bprime[0], t);
 }
 
 Bulletproof bulletproof_PROVE(uint64_t v, const rct::key &gamma)
@@ -715,13 +711,15 @@ Bulletproof bulletproof_PROVE(const rct::keyV &sv, const rct::keyV &gamma)
 
   rct::keyV V(sv.size());
   rct::keyV aL(MN), aR(MN);
-  rct::key tmp;
+  rct::key tmp, tmp2;
 
   PERF_TIMER_START_BP(PROVE_v);
   for (size_t i = 0; i < sv.size(); ++i)
   {
-    rct::addKeys2(V[i], gamma[i], sv[i], rct::H);
-    V[i] = rct::scalarmultKey(V[i], INV_EIGHT);
+    rct::key gamma8, sv8;
+    sc_mul(gamma8.bytes, gamma[i].bytes, INV_EIGHT.bytes);
+    sc_mul(sv8.bytes, sv[i].bytes, INV_EIGHT.bytes);
+    rct::addKeys2(V[i], gamma8, sv8, rct::H);
   }
   PERF_TIMER_STOP(PROVE_v);
 
@@ -843,10 +841,16 @@ try_again:
   // PAPER LINES 47-48
   rct::key tau1 = rct::skGen(), tau2 = rct::skGen();
 
-  rct::key T1 = rct::addKeys(rct::scalarmultH(t1), rct::scalarmultBase(tau1));
-  T1 = rct::scalarmultKey(T1, INV_EIGHT);
-  rct::key T2 = rct::addKeys(rct::scalarmultH(t2), rct::scalarmultBase(tau2));
-  T2 = rct::scalarmultKey(T2, INV_EIGHT);
+  rct::key T1, T2;
+  ge_p3 p3;
+  sc_mul(tmp.bytes, t1.bytes, INV_EIGHT.bytes);
+  sc_mul(tmp2.bytes, tau1.bytes, INV_EIGHT.bytes);
+  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
+  ge_p3_tobytes(T1.bytes, &p3);
+  sc_mul(tmp.bytes, t2.bytes, INV_EIGHT.bytes);
+  sc_mul(tmp2.bytes, tau2.bytes, INV_EIGHT.bytes);
+  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
+  ge_p3_tobytes(T2.bytes, &p3);
 
   // PAPER LINES 49-51
   rct::key x = hash_cache_mash(hash_cache, z, T1, T2);
@@ -972,7 +976,7 @@ try_again:
   PERF_TIMER_STOP(PROVE_step4);
 
   // PAPER LINE 58 (with inclusions from PAPER LINE 8 and PAPER LINE 20)
-  return Bulletproof(V, A, S, T1, T2, taux, mu, L, R, aprime[0], bprime[0], t);
+  return Bulletproof(std::move(V), A, S, T1, T2, taux, mu, std::move(L), std::move(R), aprime[0], bprime[0], t);
 }
 
 Bulletproof bulletproof_PROVE(const std::vector<uint64_t> &v, const rct::keyV &gamma)
@@ -1034,10 +1038,10 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   // setup weighted aggregates
   rct::key Z0 = rct::identity();
   rct::key z1 = rct::zero();
-  rct::key Z2 = rct::identity();
+  rct::key &Z2 = Z0;
   rct::key z3 = rct::zero();
   rct::keyV z4(maxMN, rct::zero()), z5(maxMN, rct::zero());
-  rct::key Y2 = rct::identity(), Y3 = rct::identity(), Y4 = rct::identity();
+  rct::key Y2 = rct::identity(), &Y3 = Y2, &Y4 = Y2;
   rct::key y0 = rct::zero(), y1 = rct::zero();
   for (const Bulletproof *p: proofs)
   {
@@ -1224,11 +1228,8 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   // now check all proofs at once
   PERF_TIMER_START_BP(VERIFY_step2_check);
   ge_p3 check1;
-  ge_scalarmult_base(&check1, y0.bytes);
-  addKeys_acc_p3(&check1, y1, rct::H);
+  ge_double_scalarmult_base_vartime_p3(&check1, y1.bytes, &ge_p3_H, y0.bytes);
   sub_acc_p3(&check1, Y2);
-  sub_acc_p3(&check1, Y3);
-  sub_acc_p3(&check1, Y4);
   if (!ge_p3_is_point_at_infinity(&check1))
   {
     MERROR("Verification failure at step 1");
@@ -1238,18 +1239,15 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   sc_sub(tmp.bytes, rct::zero().bytes, z1.bytes);
   ge_double_scalarmult_base_vartime_p3(&check2, z3.bytes, &ge_p3_H, tmp.bytes);
   add_acc_p3(&check2, Z0);
-  add_acc_p3(&check2, Z2);
 
   std::vector<MultiexpData> multiexp_data;
   multiexp_data.reserve(2 * maxMN);
   for (size_t i = 0; i < maxMN; ++i)
   {
-    sc_sub(tmp.bytes, rct::zero().bytes, z4[i].bytes);
-    multiexp_data.emplace_back(tmp, Gi_p3[i]);
-    sc_sub(tmp.bytes, rct::zero().bytes, z5[i].bytes);
-    multiexp_data.emplace_back(tmp, Hi_p3[i]);
+    multiexp_data.emplace_back(z4[i], Gi_p3[i]);
+    multiexp_data.emplace_back(z5[i], Hi_p3[i]);
   }
-  add_acc_p3(&check2, multiexp(multiexp_data, true));
+  sub_acc_p3(&check2, multiexp(multiexp_data, true));
   PERF_TIMER_STOP(VERIFY_step2_check);
 
   if (!ge_p3_is_point_at_infinity(&check2))

From 484155d043d661ae1a3d7a196b2354719b90485a Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 8 Aug 2018 15:01:41 +0000
Subject: [PATCH 04/16] bulletproofs: some more speedup

---
 src/ringct/bulletproofs.cc | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 2d2e6af36..5c75e6418 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -423,6 +423,7 @@ Bulletproof bulletproof_PROVE(const rct::key &sv, const rct::key &gamma)
 
   rct::key V;
   rct::keyV aL(N), aR(N);
+  rct::keyV aL8(N), aR8(N);
   rct::key tmp, tmp2;
 
   PERF_TIMER_START_BP(PROVE_v);
@@ -438,12 +439,15 @@ Bulletproof bulletproof_PROVE(const rct::key &sv, const rct::key &gamma)
     if (sv[i/8] & (((uint64_t)1)<<(i%8)))
     {
       aL[i] = rct::identity();
+      aL8[i] = INV_EIGHT;
+      aR[i] = aR8[i] = rct::zero();
     }
     else
     {
-      aL[i] = rct::zero();
+      aL[i] = aL8[i] = rct::zero();
+      aR[i] = MINUS_ONE;
+      aR8[i] = MINUS_INV_EIGHT;
     }
-    sc_sub(aR[i].bytes, aL[i].bytes, rct::identity().bytes);
   }
   PERF_TIMER_STOP(PROVE_aLaR);
 
@@ -469,10 +473,10 @@ try_again:
   PERF_TIMER_START_BP(PROVE_step1);
   // PAPER LINES 38-39
   rct::key alpha = rct::skGen();
-  rct::key ve = vector_exponent(aL, aR);
+  rct::key ve = vector_exponent(aL8, aR8);
   rct::key A;
-  rct::addKeys(A, ve, rct::scalarmultBase(alpha));
-  A = rct::scalarmultKey(A, INV_EIGHT);
+  sc_mul(tmp.bytes, alpha.bytes, INV_EIGHT.bytes);
+  rct::addKeys(A, ve, rct::scalarmultBase(tmp));
 
   // PAPER LINES 40-42
   rct::keyV sL = rct::skvGen(N), sR = rct::skvGen(N);
@@ -711,6 +715,7 @@ Bulletproof bulletproof_PROVE(const rct::keyV &sv, const rct::keyV &gamma)
 
   rct::keyV V(sv.size());
   rct::keyV aL(MN), aR(MN);
+  rct::keyV aL8(MN), aR8(MN);
   rct::key tmp, tmp2;
 
   PERF_TIMER_START_BP(PROVE_v);
@@ -728,19 +733,18 @@ Bulletproof bulletproof_PROVE(const rct::keyV &sv, const rct::keyV &gamma)
   {
     for (size_t i = N; i-- > 0; )
     {
-      if (j >= sv.size())
-      {
-        aL[j*N+i] = rct::zero();
-      }
-      else if (sv[j][i/8] & (((uint64_t)1)<<(i%8)))
+      if (j < sv.size() && (sv[j][i/8] & (((uint64_t)1)<<(i%8))))
       {
         aL[j*N+i] = rct::identity();
+        aL8[j*N+i] = INV_EIGHT;
+        aR[j*N+i] = aR8[j*N+i] = rct::zero();
       }
       else
       {
-        aL[j*N+i] = rct::zero();
+        aL[j*N+i] = aL8[j*N+i] = rct::zero();
+        aR[j*N+i] = MINUS_ONE;
+        aR8[j*N+i] = MINUS_INV_EIGHT;
       }
-      sc_sub(aR[j*N+i].bytes, aL[j*N+i].bytes, rct::identity().bytes);
     }
   }
   PERF_TIMER_STOP(PROVE_aLaR);
@@ -771,10 +775,10 @@ try_again:
   PERF_TIMER_START_BP(PROVE_step1);
   // PAPER LINES 38-39
   rct::key alpha = rct::skGen();
-  rct::key ve = vector_exponent(aL, aR);
+  rct::key ve = vector_exponent(aL8, aR8);
   rct::key A;
-  rct::addKeys(A, ve, rct::scalarmultBase(alpha));
-  A = rct::scalarmultKey(A, INV_EIGHT);
+  sc_mul(tmp.bytes, alpha.bytes, INV_EIGHT.bytes);
+  rct::addKeys(A, ve, rct::scalarmultBase(tmp));
 
   // PAPER LINES 40-42
   rct::keyV sL = rct::skvGen(MN), sR = rct::skvGen(MN);

From a281b950bff73fc715554d00ec32292ef97b56ec Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 8 Aug 2018 18:39:31 +0000
Subject: [PATCH 05/16] bulletproofs: remove single value prover

It is now expressed in terms of the array prover
---
 src/ringct/bulletproofs.cc | 284 +------------------------------------
 src/ringct/rctSigs.cpp     |   9 --
 2 files changed, 2 insertions(+), 291 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 5c75e6418..09d22c6d1 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -313,17 +313,6 @@ static rct::keyV vector_dup(const rct::key &x, size_t N)
   return rct::keyV(N, x);
 }
 
-/* Get the sum of a vector's elements */
-static rct::key vector_sum(const rct::keyV &a)
-{
-  rct::key res = rct::zero();
-  for (size_t i = 0; i < a.size(); ++i)
-  {
-    sc_add(res.bytes, res.bytes, a[i].bytes);
-  }
-  return res;
-}
-
 static rct::key switch_endianness(rct::key k)
 {
   std::reverse(k.bytes, k.bytes + sizeof(k));
@@ -414,281 +403,12 @@ static rct::key hash_cache_mash(rct::key &hash_cache, const rct::key &mash0, con
 /* Given a value v (0..2^N-1) and a mask gamma, construct a range proof */
 Bulletproof bulletproof_PROVE(const rct::key &sv, const rct::key &gamma)
 {
-  init_exponents();
-
-  PERF_TIMER_UNIT(PROVE, 1000000);
-
-  constexpr size_t logN = 6; // log2(64)
-  constexpr size_t N = 1<<logN;
-
-  rct::key V;
-  rct::keyV aL(N), aR(N);
-  rct::keyV aL8(N), aR8(N);
-  rct::key tmp, tmp2;
-
-  PERF_TIMER_START_BP(PROVE_v);
-  rct::key gamma8, sv8;
-  sc_mul(gamma8.bytes, gamma.bytes, INV_EIGHT.bytes);
-  sc_mul(sv8.bytes, sv.bytes, INV_EIGHT.bytes);
-  rct::addKeys2(V, gamma8, sv8, rct::H);
-  PERF_TIMER_STOP(PROVE_v);
-
-  PERF_TIMER_START_BP(PROVE_aLaR);
-  for (size_t i = N; i-- > 0; )
-  {
-    if (sv[i/8] & (((uint64_t)1)<<(i%8)))
-    {
-      aL[i] = rct::identity();
-      aL8[i] = INV_EIGHT;
-      aR[i] = aR8[i] = rct::zero();
-    }
-    else
-    {
-      aL[i] = aL8[i] = rct::zero();
-      aR[i] = MINUS_ONE;
-      aR8[i] = MINUS_INV_EIGHT;
-    }
-  }
-  PERF_TIMER_STOP(PROVE_aLaR);
-
-  rct::key hash_cache = rct::hash_to_scalar(V);
-
-  // DEBUG: Test to ensure this recovers the value
-#ifdef DEBUG_BP
-  uint64_t test_aL = 0, test_aR = 0;
-  for (size_t i = 0; i < N; ++i)
-  {
-    if (aL[i] == rct::identity())
-      test_aL += ((uint64_t)1)<<i;
-    if (aR[i] == rct::zero())
-      test_aR += ((uint64_t)1)<<i;
-  }
-  uint64_t v_test = 0;
-  for (int n = 0; n < 8; ++n) v_test |= (((uint64_t)sv[n]) << (8*n));
-  CHECK_AND_ASSERT_THROW_MES(test_aL == v_test, "test_aL failed");
-  CHECK_AND_ASSERT_THROW_MES(test_aR == v_test, "test_aR failed");
-#endif
-
-try_again:
-  PERF_TIMER_START_BP(PROVE_step1);
-  // PAPER LINES 38-39
-  rct::key alpha = rct::skGen();
-  rct::key ve = vector_exponent(aL8, aR8);
-  rct::key A;
-  sc_mul(tmp.bytes, alpha.bytes, INV_EIGHT.bytes);
-  rct::addKeys(A, ve, rct::scalarmultBase(tmp));
-
-  // PAPER LINES 40-42
-  rct::keyV sL = rct::skvGen(N), sR = rct::skvGen(N);
-  rct::key rho = rct::skGen();
-  ve = vector_exponent(sL, sR);
-  rct::key S;
-  rct::addKeys(S, ve, rct::scalarmultBase(rho));
-  S = rct::scalarmultKey(S, INV_EIGHT);
-
-  // PAPER LINES 43-45
-  rct::key y = hash_cache_mash(hash_cache, A, S);
-  if (y == rct::zero())
-  {
-    PERF_TIMER_STOP(PROVE_step1);
-    MINFO("y is 0, trying again");
-    goto try_again;
-  }
-  rct::key z = hash_cache = rct::hash_to_scalar(y);
-  if (z == rct::zero())
-  {
-    PERF_TIMER_STOP(PROVE_step1);
-    MINFO("z is 0, trying again");
-    goto try_again;
-  }
-
-  // Polynomial construction before PAPER LINE 46
-  rct::key t0 = rct::zero();
-  rct::key t1 = rct::zero();
-  rct::key t2 = rct::zero();
-
-  const auto yN = vector_powers(y, N);
-
-  rct::key ip1y = vector_sum(yN);
-  sc_muladd(t0.bytes, z.bytes, ip1y.bytes, t0.bytes);
-
-  rct::key zsq;
-  sc_mul(zsq.bytes, z.bytes, z.bytes);
-  sc_muladd(t0.bytes, zsq.bytes, sv.bytes, t0.bytes);
-
-  rct::key k = rct::zero();
-  sc_mulsub(k.bytes, zsq.bytes, ip1y.bytes, k.bytes);
-
-  rct::key zcu;
-  sc_mul(zcu.bytes, zsq.bytes, z.bytes);
-  sc_mulsub(k.bytes, zcu.bytes, ip12.bytes, k.bytes);
-  sc_add(t0.bytes, t0.bytes, k.bytes);
-
-  // DEBUG: Test the value of t0 has the correct form
-#ifdef DEBUG_BP
-  rct::key test_t0 = rct::zero();
-  rct::key iph = inner_product(aL, hadamard(aR, yN));
-  sc_add(test_t0.bytes, test_t0.bytes, iph.bytes);
-  rct::key ips = inner_product(vector_subtract(aL, aR), yN);
-  sc_muladd(test_t0.bytes, z.bytes, ips.bytes, test_t0.bytes);
-  rct::key ipt = inner_product(twoN, aL);
-  sc_muladd(test_t0.bytes, zsq.bytes, ipt.bytes, test_t0.bytes);
-  sc_add(test_t0.bytes, test_t0.bytes, k.bytes);
-  CHECK_AND_ASSERT_THROW_MES(t0 == test_t0, "t0 check failed");
-#endif
-  PERF_TIMER_STOP(PROVE_step1);
-
-  PERF_TIMER_START_BP(PROVE_step2);
-  const auto HyNsR = hadamard(yN, sR);
-  const auto vpIz = vector_dup(z, N);
-  const auto vp2zsq = vector_scalar(twoN, zsq);
-  const auto aL_vpIz = vector_subtract(aL, vpIz);
-  const auto aR_vpIz = vector_add(aR, vpIz);
-
-  rct::key ip1 = inner_product(aL_vpIz, HyNsR);
-  sc_add(t1.bytes, t1.bytes, ip1.bytes);
-
-  rct::key ip2 = inner_product(sL, vector_add(hadamard(yN, aR_vpIz), vp2zsq));
-  sc_add(t1.bytes, t1.bytes, ip2.bytes);
-
-  rct::key ip3 = inner_product(sL, HyNsR);
-  sc_add(t2.bytes, t2.bytes, ip3.bytes);
-
-  // PAPER LINES 47-48
-  rct::key tau1 = rct::skGen(), tau2 = rct::skGen();
-
-  rct::key T1, T2;
-  ge_p3 p3;
-  sc_mul(tmp.bytes, t1.bytes, INV_EIGHT.bytes);
-  sc_mul(tmp2.bytes, tau1.bytes, INV_EIGHT.bytes);
-  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
-  ge_p3_tobytes(T1.bytes, &p3);
-  sc_mul(tmp.bytes, t2.bytes, INV_EIGHT.bytes);
-  sc_mul(tmp2.bytes, tau2.bytes, INV_EIGHT.bytes);
-  ge_double_scalarmult_base_vartime_p3(&p3, tmp.bytes, &ge_p3_H, tmp2.bytes);
-  ge_p3_tobytes(T2.bytes, &p3);
-
-  // PAPER LINES 49-51
-  rct::key x = hash_cache_mash(hash_cache, z, T1, T2);
-  if (x == rct::zero())
-  {
-    PERF_TIMER_STOP(PROVE_step2);
-    MINFO("x is 0, trying again");
-    goto try_again;
-  }
-
-  // PAPER LINES 52-53
-  rct::key taux = rct::zero();
-  sc_mul(taux.bytes, tau1.bytes, x.bytes);
-  rct::key xsq;
-  sc_mul(xsq.bytes, x.bytes, x.bytes);
-  sc_muladd(taux.bytes, tau2.bytes, xsq.bytes, taux.bytes);
-  sc_muladd(taux.bytes, gamma.bytes, zsq.bytes, taux.bytes);
-  rct::key mu;
-  sc_muladd(mu.bytes, x.bytes, rho.bytes, alpha.bytes);
-
-  // PAPER LINES 54-57
-  rct::keyV l = vector_add(aL_vpIz, vector_scalar(sL, x));
-  rct::keyV r = vector_add(hadamard(yN, vector_add(aR_vpIz, vector_scalar(sR, x))), vp2zsq);
-  PERF_TIMER_STOP(PROVE_step2);
-
-  PERF_TIMER_START_BP(PROVE_step3);
-  rct::key t = inner_product(l, r);
-
-  // DEBUG: Test if the l and r vectors match the polynomial forms
-#ifdef DEBUG_BP
-  rct::key test_t;
-  sc_muladd(test_t.bytes, t1.bytes, x.bytes, t0.bytes);
-  sc_muladd(test_t.bytes, t2.bytes, xsq.bytes, test_t.bytes);
-  CHECK_AND_ASSERT_THROW_MES(test_t == t, "test_t check failed");
-#endif
-
-  // PAPER LINES 32-33
-  rct::key x_ip = hash_cache_mash(hash_cache, x, taux, mu, t);
-
-  // These are used in the inner product rounds
-  size_t nprime = N;
-  std::vector<ge_p3> Gprime(N);
-  std::vector<ge_p3> Hprime(N);
-  rct::keyV aprime(N);
-  rct::keyV bprime(N);
-  const rct::key yinv = invert(y);
-  rct::key yinvpow = rct::identity();
-  for (size_t i = 0; i < N; ++i)
-  {
-    Gprime[i] = Gi_p3[i];
-    ge_scalarmult_p3(&Hprime[i], yinvpow.bytes, &Hi_p3[i]);
-    sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
-    aprime[i] = l[i];
-    bprime[i] = r[i];
-  }
-  rct::keyV L(logN);
-  rct::keyV R(logN);
-  int round = 0;
-  rct::keyV w(logN); // this is the challenge x in the inner product protocol
-  PERF_TIMER_STOP(PROVE_step3);
-
-  PERF_TIMER_START_BP(PROVE_step4);
-  // PAPER LINE 13
-  while (nprime > 1)
-  {
-    // PAPER LINE 15
-    nprime /= 2;
-
-    // PAPER LINES 16-17
-    rct::key cL = inner_product(slice(aprime, 0, nprime), slice(bprime, nprime, bprime.size()));
-    rct::key cR = inner_product(slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
-
-    // PAPER LINES 18-19
-    sc_mul(tmp.bytes, cL.bytes, x_ip.bytes);
-    L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, &ge_p3_H, &tmp);
-    sc_mul(tmp.bytes, cR.bytes, x_ip.bytes);
-    R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, &ge_p3_H, &tmp);
-
-    // PAPER LINES 21-22
-    w[round] = hash_cache_mash(hash_cache, L[round], R[round]);
-    if (w[round] == rct::zero())
-    {
-      PERF_TIMER_STOP(PROVE_step4);
-      MINFO("w[round] is 0, trying again");
-      goto try_again;
-    }
-
-    // PAPER LINES 24-25
-    const rct::key winv = invert(w[round]);
-    if (nprime > 1)
-    {
-      hadamard_fold(Gprime, winv, w[round]);
-      hadamard_fold(Hprime, w[round], winv);
-    }
-
-    // PAPER LINES 28-29
-    aprime = vector_add(vector_scalar(slice(aprime, 0, nprime), w[round]), vector_scalar(slice(aprime, nprime, aprime.size()), winv));
-    bprime = vector_add(vector_scalar(slice(bprime, 0, nprime), winv), vector_scalar(slice(bprime, nprime, bprime.size()), w[round]));
-
-    ++round;
-  }
-  PERF_TIMER_STOP(PROVE_step4);
-
-  // PAPER LINE 58 (with inclusions from PAPER LINE 8 and PAPER LINE 20)
-  return Bulletproof(V, A, S, T1, T2, taux, mu, std::move(L), std::move(R), aprime[0], bprime[0], t);
+  return bulletproof_PROVE(rct::keyV(1, sv), rct::keyV(1, gamma));
 }
 
 Bulletproof bulletproof_PROVE(uint64_t v, const rct::key &gamma)
 {
-  // vG + gammaH
-  PERF_TIMER_START_BP(PROVE_v);
-  rct::key sv = rct::zero();
-  sv.bytes[0] = v & 255;
-  sv.bytes[1] = (v >> 8) & 255;
-  sv.bytes[2] = (v >> 16) & 255;
-  sv.bytes[3] = (v >> 24) & 255;
-  sv.bytes[4] = (v >> 32) & 255;
-  sv.bytes[5] = (v >> 40) & 255;
-  sv.bytes[6] = (v >> 48) & 255;
-  sv.bytes[7] = (v >> 56) & 255;
-  PERF_TIMER_STOP(PROVE_v);
-  return bulletproof_PROVE(sv, gamma);
+  return bulletproof_PROVE(std::vector<uint64_t>(1, v), rct::keyV(1, gamma));
 }
 
 /* Given a set of values v (0..2^N-1) and masks gamma, construct a range proof */
diff --git a/src/ringct/rctSigs.cpp b/src/ringct/rctSigs.cpp
index 0d1789a38..5ec9a1750 100644
--- a/src/ringct/rctSigs.cpp
+++ b/src/ringct/rctSigs.cpp
@@ -45,15 +45,6 @@ using namespace std;
 #define CHECK_AND_ASSERT_MES_L1(expr, ret, message) {if(!(expr)) {MCERROR("verify", message); return ret;}}
 
 namespace rct {
-    Bulletproof proveRangeBulletproof(key &C, key &mask, uint64_t amount)
-    {
-        mask = rct::skGen();
-        Bulletproof proof = bulletproof_PROVE(amount, mask);
-        CHECK_AND_ASSERT_THROW_MES(proof.V.size() == 1, "V has not exactly one element");
-        C = proof.V[0];
-        return proof;
-    }
-
     Bulletproof proveRangeBulletproof(keyV &C, keyV &masks, const std::vector<uint64_t> &amounts)
     {
         masks = rct::skvGen(amounts.size());

From c415df97bd20b2bcc999c0c0056bb32e9b102a5b Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 8 Aug 2018 21:19:57 +0000
Subject: [PATCH 06/16] performance_tests: sc_check and ge_dsm_precomp

---
 tests/performance_tests/CMakeLists.txt |  2 +
 tests/performance_tests/crypto_ops.h   |  3 ++
 tests/performance_tests/main.cpp       |  3 ++
 tests/performance_tests/sc_check.h     | 52 ++++++++++++++++++++++++++
 4 files changed, 60 insertions(+)
 create mode 100644 tests/performance_tests/sc_check.h

diff --git a/tests/performance_tests/CMakeLists.txt b/tests/performance_tests/CMakeLists.txt
index 837d39bd3..5cd054d86 100644
--- a/tests/performance_tests/CMakeLists.txt
+++ b/tests/performance_tests/CMakeLists.txt
@@ -46,6 +46,8 @@ set(performance_tests_headers
   range_proof.h
   bulletproof.h
   crypto_ops.h
+  sc_reduce32.h
+  sc_check.h
   multiexp.h
   multi_tx_test_base.h
   performance_tests.h
diff --git a/tests/performance_tests/crypto_ops.h b/tests/performance_tests/crypto_ops.h
index 3c68583c5..4766a1205 100644
--- a/tests/performance_tests/crypto_ops.h
+++ b/tests/performance_tests/crypto_ops.h
@@ -47,6 +47,7 @@ enum test_op
   op_scalarmultKey,
   op_scalarmultH,
   op_scalarmult8,
+  op_ge_dsm_precomp,
   op_ge_double_scalarmult_base_vartime,
   op_ge_double_scalarmult_precomp_vartime,
   op_ge_double_scalarmult_precomp_vartime2,
@@ -84,6 +85,7 @@ public:
     ge_cached tmp_cached;
     ge_p1p1 tmp_p1p1;
     ge_p2 tmp_p2;
+    ge_dsmp dsmp;
     switch (op)
     {
       case op_sc_add: sc_add(key.bytes, scalar0.bytes, scalar1.bytes); break;
@@ -101,6 +103,7 @@ public:
       case op_scalarmultKey: rct::scalarmultKey(point0, scalar0); break;
       case op_scalarmultH: rct::scalarmultH(scalar0); break;
       case op_scalarmult8: rct::scalarmult8(point0); break;
+      case op_ge_dsm_precomp: ge_dsm_precomp(dsmp, &p3_0); break;
       case op_ge_double_scalarmult_base_vartime: ge_double_scalarmult_base_vartime(&tmp_p2, scalar0.bytes, &p3_0, scalar1.bytes); break;
       case op_ge_double_scalarmult_precomp_vartime: ge_double_scalarmult_precomp_vartime(&tmp_p2, scalar0.bytes, &p3_0, scalar1.bytes, precomp0); break;
       case op_ge_double_scalarmult_precomp_vartime2: ge_double_scalarmult_precomp_vartime2(&tmp_p2, scalar0.bytes, precomp0, scalar1.bytes, precomp1); break;
diff --git a/tests/performance_tests/main.cpp b/tests/performance_tests/main.cpp
index 87a1573c2..3765d1249 100644
--- a/tests/performance_tests/main.cpp
+++ b/tests/performance_tests/main.cpp
@@ -50,6 +50,7 @@
 #include "is_out_to_acc.h"
 #include "subaddress_expand.h"
 #include "sc_reduce32.h"
+#include "sc_check.h"
 #include "cn_fast_hash.h"
 #include "rct_mlsag.h"
 #include "equality.h"
@@ -184,6 +185,7 @@ int main(int argc, char** argv)
   TEST_PERFORMANCE0(filter, p, test_ge_frombytes_vartime);
   TEST_PERFORMANCE0(filter, p, test_generate_keypair);
   TEST_PERFORMANCE0(filter, p, test_sc_reduce32);
+  TEST_PERFORMANCE0(filter, p, test_sc_check);
   TEST_PERFORMANCE1(filter, p, test_signature, false);
   TEST_PERFORMANCE1(filter, p, test_signature, true);
 
@@ -249,6 +251,7 @@ int main(int argc, char** argv)
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_scalarmultKey);
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_scalarmultH);
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_scalarmult8);
+  TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_ge_dsm_precomp);
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_ge_double_scalarmult_base_vartime);
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_ge_double_scalarmult_precomp_vartime);
   TEST_PERFORMANCE1(filter, p, test_crypto_ops, op_ge_double_scalarmult_precomp_vartime2);
diff --git a/tests/performance_tests/sc_check.h b/tests/performance_tests/sc_check.h
new file mode 100644
index 000000000..036abf12d
--- /dev/null
+++ b/tests/performance_tests/sc_check.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018, The Monero Project
+// 
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without modification, are
+// permitted provided that the following conditions are met:
+// 
+// 1. Redistributions of source code must retain the above copyright notice, this list of
+//    conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list
+//    of conditions and the following disclaimer in the documentation and/or other
+//    materials provided with the distribution.
+// 
+// 3. Neither the name of the copyright holder nor the names of its contributors may be
+//    used to endorse or promote products derived from this software without specific
+//    prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include "crypto/crypto.h"
+
+class test_sc_check // performance test case timing sc_check() on a single scalar
+{
+public:
+  static const size_t loop_count = 10000000; // presumably the iteration count read by the test harness - confirm
+  // init() stores the scalar that every test() call passes to sc_check
+  bool init()
+  {
+    m_scalar = crypto::rand<crypto::ec_scalar>();
+    return true;
+  }
+  // test() is the measured body; the value returned by sc_check is ignored
+  bool test()
+  {
+    sc_check((unsigned char*)m_scalar.data);
+    return true;
+  }
+
+private:
+  crypto::ec_scalar m_scalar; // set once in init(), only read in test()
+};

From bf8e4b98709ffcf133ea875f3fbf7f99c28e52db Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 8 Aug 2018 21:20:50 +0000
Subject: [PATCH 07/16] bulletproofs: some more minor speedup

---
 src/ringct/bulletproofs.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 09d22c6d1..f22a109e9 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -86,11 +86,9 @@ static inline rct::key multiexp(const std::vector<MultiexpData> &data, bool HiGi
     return data.size() <= 64 ? straus(data, NULL, 0) : pippenger(data, NULL, get_pippenger_c(data.size()));
 }
 
-static bool is_reduced(const rct::key &scalar)
+static inline bool is_reduced(const rct::key &scalar)
 {
-  rct::key reduced = scalar;
-  sc_reduce32(reduced.bytes);
-  return scalar == reduced;
+  return sc_check(scalar.bytes) == 0;
 }
 
 static void add_acc_p3(ge_p3 *acc_p3, const rct::key &point)
@@ -139,8 +137,8 @@ static void init_exponents()
     Gi[i] = get_exponent(rct::H, i * 2 + 1);
     CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&Gi_p3[i], Gi[i].bytes) == 0, "ge_frombytes_vartime failed");
 
-    data.push_back({rct::zero(), Gi[i]});
-    data.push_back({rct::zero(), Hi[i]});
+    data.push_back({rct::zero(), Gi_p3[i]});
+    data.push_back({rct::zero(), Hi_p3[i]});
   }
 
   straus_HiGi_cache = straus_init_cache(data, STRAUS_SIZE_LIMIT);

From 4061960a16d27f803ca54b8887f46185c29c336e Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Mon, 13 Aug 2018 11:18:54 +0000
Subject: [PATCH 08/16] multiexp: pack the digits table when STRAUS_C is 4

Spotted by stoffu
---
 src/ringct/multiexp.cc | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc
index 21957b94c..fb2f18551 100644
--- a/src/ringct/multiexp.cc
+++ b/src/ringct/multiexp.cc
@@ -320,7 +320,7 @@ rct::key bos_coster_heap_conv_robust(std::vector<MultiexpData> data)
   return res;
 }
 
-static constexpr unsigned int STRAUS_C = 4;
+#define STRAUS_C 4
 
 struct straus_cached_data
 {
@@ -447,26 +447,23 @@ rct::key straus(const std::vector<MultiexpData> &data, const std::shared_ptr<str
 #endif
 
   MULTIEXP_PERF(PERF_TIMER_START_UNIT(digits, 1000000));
+#if STRAUS_C==4
+  std::unique_ptr<uint8_t[]> digits{new uint8_t[64 * data.size()]};
+#else
   std::unique_ptr<uint8_t[]> digits{new uint8_t[256 * data.size()]};
+#endif
   for (size_t j = 0; j < data.size(); ++j)
   {
     unsigned char bytes33[33];
     memcpy(bytes33,  data[j].scalar.bytes, 32);
     bytes33[32] = 0;
     const unsigned char *bytes = bytes33;
-#if 1
-    static_assert(STRAUS_C == 4, "optimized version needs STRAUS_C == 4");
+#if STRAUS_C==4
     unsigned int i;
-    for (i = 0; i < 256; i += 8, bytes++)
+    for (i = 0; i < 64; i += 2, bytes++)
     {
-      digits[j*256+i] = bytes[0] & 0xf;
-      digits[j*256+i+1] = (bytes[0] >> 1) & 0xf;
-      digits[j*256+i+2] = (bytes[0] >> 2) & 0xf;
-      digits[j*256+i+3] = (bytes[0] >> 3) & 0xf;
-      digits[j*256+i+4] = ((bytes[0] >> 4) | (bytes[1]<<4)) & 0xf;
-      digits[j*256+i+5] = ((bytes[0] >> 5) | (bytes[1]<<3)) & 0xf;
-      digits[j*256+i+6] = ((bytes[0] >> 6) | (bytes[1]<<2)) & 0xf;
-      digits[j*256+i+7] = ((bytes[0] >> 7) | (bytes[1]<<1)) & 0xf;
+      digits[j*64+i] = bytes[0] & 0xf;
+      digits[j*64+i+1] = bytes[0] >> 4;
     }
 #elif 1
     for (size_t i = 0; i < 256; ++i)
@@ -521,7 +518,11 @@ skipfirst:
         if (skip[j])
           continue;
 #endif
+#if STRAUS_C==4
+        const uint8_t digit = digits[j*64+i/4];
+#else
         const uint8_t digit = digits[j*256+i];
+#endif
         if (digit)
         {
           ge_add(&p1, &band_p3, &CACHE_OFFSET(local_cache, j, digit));

From fc9f7d9c81acfe19fdf1285e690d4dfe5ee9d172 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Thu, 23 Aug 2018 18:52:05 +0000
Subject: [PATCH 09/16] bulletproofs: merge multiexps as per sarang's new
 python code

---
 src/ringct/bulletproofs.cc | 106 +++++++++++++------------------------
 1 file changed, 38 insertions(+), 68 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index f22a109e9..549e52296 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -91,28 +91,6 @@ static inline bool is_reduced(const rct::key &scalar)
   return sc_check(scalar.bytes) == 0;
 }
 
-static void add_acc_p3(ge_p3 *acc_p3, const rct::key &point)
-{
-    ge_p3 p3;
-    CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&p3, point.bytes) == 0, "ge_frombytes_vartime failed");
-    ge_cached cached;
-    ge_p3_to_cached(&cached, &p3);
-    ge_p1p1 p1;
-    ge_add(&p1, acc_p3, &cached);
-    ge_p1p1_to_p3(acc_p3, &p1);
-}
-
-static void sub_acc_p3(ge_p3 *acc_p3, const rct::key &point)
-{
-    ge_p3 p3;
-    CHECK_AND_ASSERT_THROW_MES(ge_frombytes_vartime(&p3, point.bytes) == 0, "ge_frombytes_vartime failed");
-    ge_cached cached;
-    ge_p3_to_cached(&cached, &p3);
-    ge_p1p1 p1;
-    ge_sub(&p1, acc_p3, &cached);
-    ge_p1p1_to_p3(acc_p3, &p1);
-}
-
 static rct::key get_exponent(const rct::key &base, size_t idx)
 {
   static const std::string salt("bulletproof");
@@ -733,6 +711,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
   // sanity and figure out which proof is longest
   size_t max_length = 0;
+  size_t nV = 0;
   for (const Bulletproof *p: proofs)
   {
     const Bulletproof &proof = *p;
@@ -749,6 +728,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     CHECK_AND_ASSERT_MES(proof.L.size() > 0, false, "Empty proof");
 
     max_length = std::max(max_length, proof.L.size());
+    nV += proof.V.size();
   }
   CHECK_AND_ASSERT_MES(max_length < 32, false, "At least one proof is too large");
   size_t maxMN = 1u << max_length;
@@ -757,13 +737,13 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   const size_t N = 1 << logN;
   rct::key tmp;
 
+  std::vector<MultiexpData> multiexp_data;
+  multiexp_data.reserve(nV + (2 * (10/*logM*/ + logN) + 4) * proofs.size() + 2 * maxMN);
+
   // setup weighted aggregates
-  rct::key Z0 = rct::identity();
   rct::key z1 = rct::zero();
-  rct::key &Z2 = Z0;
   rct::key z3 = rct::zero();
   rct::keyV z4(maxMN, rct::zero()), z5(maxMN, rct::zero());
-  rct::key Y2 = rct::identity(), &Y3 = Y2, &Y4 = Y2;
   rct::key y0 = rct::zero(), y1 = rct::zero();
   for (const Bulletproof *p: proofs)
   {
@@ -773,7 +753,8 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     for (logM = 0; (M = 1<<logM) <= maxM && M < proof.V.size(); ++logM);
     CHECK_AND_ASSERT_MES(proof.L.size() == 6+logM, false, "Proof is not the expected size");
     const size_t MN = M*N;
-    rct::key weight = rct::skGen();
+    const rct::key weight_y = rct::skGen();
+    const rct::key weight_z = rct::skGen();
 
     // Reconstruct the challenges
     PERF_TIMER_START_BP(VERIFY_start);
@@ -795,10 +776,11 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     rct::key proof8_T1 = rct::scalarmult8(proof.T1);
     rct::key proof8_T2 = rct::scalarmult8(proof.T2);
     rct::key proof8_S = rct::scalarmult8(proof.S);
+    rct::key proof8_A = rct::scalarmult8(proof.A);
 
     PERF_TIMER_START_BP(VERIFY_line_61);
     // PAPER LINE 61
-    sc_muladd(y0.bytes, proof.taux.bytes, weight.bytes, y0.bytes);
+    sc_muladd(y0.bytes, proof.taux.bytes, weight_y.bytes, y0.bytes);
 
     const rct::keyV zpow = vector_powers(z, M+3);
 
@@ -814,26 +796,26 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
     PERF_TIMER_START_BP(VERIFY_line_61rl_new);
     sc_muladd(tmp.bytes, z.bytes, ip1y.bytes, k.bytes);
-    std::vector<MultiexpData> multiexp_data;
-    multiexp_data.reserve(proof.V.size());
     sc_sub(tmp.bytes, proof.t.bytes, tmp.bytes);
-    sc_muladd(y1.bytes, tmp.bytes, weight.bytes, y1.bytes);
+    sc_muladd(y1.bytes, tmp.bytes, weight_y.bytes, y1.bytes);
     for (size_t j = 0; j < proof8_V.size(); j++)
     {
-      multiexp_data.emplace_back(zpow[j+2], proof8_V[j]);
+      sc_mul(tmp.bytes, zpow[j+2].bytes, weight_y.bytes);
+      multiexp_data.emplace_back(tmp, proof8_V[j]);
     }
-    rct::addKeys(Y2, Y2, rct::scalarmultKey(multiexp(multiexp_data, false), weight));
-    sc_mul(tmp.bytes, x.bytes, weight.bytes);
-    rct::addKeys(Y3, Y3, rct::scalarmultKey(proof8_T1, tmp));
+    sc_mul(tmp.bytes, x.bytes, weight_y.bytes);
+    multiexp_data.emplace_back(tmp, proof8_T1);
     rct::key xsq;
     sc_mul(xsq.bytes, x.bytes, x.bytes);
-    sc_mul(tmp.bytes, xsq.bytes, weight.bytes);
-    rct::addKeys(Y4, Y4, rct::scalarmultKey(proof8_T2, tmp));
+    sc_mul(tmp.bytes, xsq.bytes, weight_y.bytes);
+    multiexp_data.emplace_back(tmp, proof8_T2);
     PERF_TIMER_STOP(VERIFY_line_61rl_new);
 
     PERF_TIMER_START_BP(VERIFY_line_62);
     // PAPER LINE 62
-    rct::addKeys(Z0, Z0, rct::scalarmultKey(rct::addKeys(rct::scalarmult8(proof.A), rct::scalarmultKey(proof8_S, x)), weight));
+    multiexp_data.emplace_back(weight_z, proof8_A);
+    sc_mul(tmp.bytes, x.bytes, weight_z.bytes);
+    multiexp_data.emplace_back(tmp, proof8_S);
     PERF_TIMER_STOP(VERIFY_line_62);
 
     // Compute the number of rounds for the inner product
@@ -909,8 +891,8 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
         sc_mulsub(h_scalar.bytes, tmp.bytes, yinvpow.bytes, h_scalar.bytes);
       }
 
-      sc_muladd(z4[i].bytes, g_scalar.bytes, weight.bytes, z4[i].bytes);
-      sc_muladd(z5[i].bytes, h_scalar.bytes, weight.bytes, z5[i].bytes);
+      sc_muladd(z4[i].bytes, g_scalar.bytes, weight_z.bytes, z4[i].bytes);
+      sc_muladd(z5[i].bytes, h_scalar.bytes, weight_z.bytes, z5[i].bytes);
 
       if (i == 0)
       {
@@ -928,55 +910,43 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
     // PAPER LINE 26
     PERF_TIMER_START_BP(VERIFY_line_26_new);
-    multiexp_data.clear();
-    multiexp_data.reserve(2*rounds);
-
-    sc_muladd(z1.bytes, proof.mu.bytes, weight.bytes, z1.bytes);
+    sc_muladd(z1.bytes, proof.mu.bytes, weight_z.bytes, z1.bytes);
     for (size_t i = 0; i < rounds; ++i)
     {
       sc_mul(tmp.bytes, w[i].bytes, w[i].bytes);
+      sc_mul(tmp.bytes, tmp.bytes, weight_z.bytes);
       multiexp_data.emplace_back(tmp, proof8_L[i]);
       sc_mul(tmp.bytes, winv[i].bytes, winv[i].bytes);
+      sc_mul(tmp.bytes, tmp.bytes, weight_z.bytes);
       multiexp_data.emplace_back(tmp, proof8_R[i]);
     }
-    rct::key acc = multiexp(multiexp_data, false);
-    rct::addKeys(Z2, Z2, rct::scalarmultKey(acc, weight));
     sc_mulsub(tmp.bytes, proof.a.bytes, proof.b.bytes, proof.t.bytes);
     sc_mul(tmp.bytes, tmp.bytes, x_ip.bytes);
-    sc_muladd(z3.bytes, tmp.bytes, weight.bytes, z3.bytes);
+    sc_muladd(z3.bytes, tmp.bytes, weight_z.bytes, z3.bytes);
     PERF_TIMER_STOP(VERIFY_line_26_new);
   }
 
   // now check all proofs at once
   PERF_TIMER_START_BP(VERIFY_step2_check);
-  ge_p3 check1;
-  ge_double_scalarmult_base_vartime_p3(&check1, y1.bytes, &ge_p3_H, y0.bytes);
-  sub_acc_p3(&check1, Y2);
-  if (!ge_p3_is_point_at_infinity(&check1))
-  {
-    MERROR("Verification failure at step 1");
-    return false;
-  }
-  ge_p3 check2;
-  sc_sub(tmp.bytes, rct::zero().bytes, z1.bytes);
-  ge_double_scalarmult_base_vartime_p3(&check2, z3.bytes, &ge_p3_H, tmp.bytes);
-  add_acc_p3(&check2, Z0);
-
-  std::vector<MultiexpData> multiexp_data;
-  multiexp_data.reserve(2 * maxMN);
+  sc_sub(tmp.bytes, rct::zero().bytes, y0.bytes);
+  sc_sub(tmp.bytes, tmp.bytes, z1.bytes);
+  multiexp_data.emplace_back(tmp, rct::G);
+  sc_sub(tmp.bytes, z3.bytes, y1.bytes);
+  multiexp_data.emplace_back(tmp, rct::H);
   for (size_t i = 0; i < maxMN; ++i)
   {
-    multiexp_data.emplace_back(z4[i], Gi_p3[i]);
-    multiexp_data.emplace_back(z5[i], Hi_p3[i]);
+    sc_sub(tmp.bytes, rct::zero().bytes, z4[i].bytes);
+    multiexp_data.emplace_back(tmp, Gi_p3[i]);
+    sc_sub(tmp.bytes, rct::zero().bytes, z5[i].bytes);
+    multiexp_data.emplace_back(tmp, Hi_p3[i]);
   }
-  sub_acc_p3(&check2, multiexp(multiexp_data, true));
-  PERF_TIMER_STOP(VERIFY_step2_check);
-
-  if (!ge_p3_is_point_at_infinity(&check2))
+  if (!(multiexp(multiexp_data, false) == rct::identity()))
   {
-    MERROR("Verification failure at step 2");
+    PERF_TIMER_STOP(VERIFY_step2_check);
+    MERROR("Verification failure");
     return false;
   }
+  PERF_TIMER_STOP(VERIFY_step2_check);
 
   PERF_TIMER_STOP(VERIFY);
   return true;

From 8629a42cf6e4650b552925f7637761b8e7ee66e3 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Wed, 22 Aug 2018 22:30:14 +0000
Subject: [PATCH 10/16] bulletproofs: rework flow to use sarang's fast batch
 inversion code

---
 src/ringct/bulletproofs.cc | 231 ++++++++++++++++++++++++-------------
 1 file changed, 154 insertions(+), 77 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 549e52296..d9961cb20 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -29,8 +29,6 @@
 // Adapted from Java code by Sarang Noether
 
 #include <stdlib.h>
-#include <openssl/ssl.h>
-#include <openssl/bn.h>
 #include <boost/thread/mutex.hpp>
 #include "misc_log_ex.h"
 #include "common/perf_timer.h"
@@ -289,37 +287,59 @@ static rct::keyV vector_dup(const rct::key &x, size_t N)
   return rct::keyV(N, x);
 }
 
-static rct::key switch_endianness(rct::key k)
+static rct::key sm(rct::key y, int n, const rct::key &x) // returns y^(2^n) * x, built from sc_mul calls
 {
-  std::reverse(k.bytes, k.bytes + sizeof(k));
-  return k;
+  while (n--) // n successive squarings of y; callers pass a non-negative n
+    sc_mul(y.bytes, y.bytes, y.bytes);
+  sc_mul(y.bytes, y.bytes, x.bytes); // then a single multiplication by x
+  return y;
 }
 
-/* Compute the inverse of a scalar, the stupid way */
+/* Compute the inverse of a scalar, the clever way */
 static rct::key invert(const rct::key &x)
 {
+  rct::key _1, _10, _100, _11, _101, _111, _1001, _1011, _1111;
+  // table of small powers of x: each variable name is its exponent written in binary
+  _1 = x;
+  sc_mul(_10.bytes, _1.bytes, _1.bytes); // x^2
+  sc_mul(_100.bytes, _10.bytes, _10.bytes); // x^4
+  sc_mul(_11.bytes, _10.bytes, _1.bytes); // x^3
+  sc_mul(_101.bytes, _10.bytes, _11.bytes); // x^5 = x^2 * x^3
+  sc_mul(_111.bytes, _10.bytes, _101.bytes); // x^7 = x^2 * x^5
+  sc_mul(_1001.bytes, _10.bytes, _111.bytes); // x^9 = x^2 * x^7
+  sc_mul(_1011.bytes, _10.bytes, _1001.bytes); // x^11 = x^2 * x^9
+  sc_mul(_1111.bytes, _100.bytes, _1011.bytes); // x^15 = x^4 * x^11
+  // the running result starts at x^16 (x^15 * x) before the sm() steps extend it
   rct::key inv;
+  sc_mul(inv.bytes, _1111.bytes, _1.bytes);
 
-  BN_CTX *ctx = BN_CTX_new();
-  BIGNUM *X = BN_new();
-  BIGNUM *L = BN_new();
-  BIGNUM *I = BN_new();
-
-  BN_bin2bn(switch_endianness(x).bytes, sizeof(rct::key), X);
-  BN_bin2bn(switch_endianness(rct::curveOrder()).bytes, sizeof(rct::key), L);
-
-  CHECK_AND_ASSERT_THROW_MES(BN_mod_inverse(I, X, L, ctx), "Failed to invert");
-
-  const int len = BN_num_bytes(I);
-  CHECK_AND_ASSERT_THROW_MES((size_t)len <= sizeof(rct::key), "Invalid number length");
-  inv = rct::zero();
-  BN_bn2bin(I, inv.bytes);
-  std::reverse(inv.bytes, inv.bytes + len);
-
-  BN_free(I);
-  BN_free(L);
-  BN_free(X);
-  BN_CTX_free(ctx);
+  inv = sm(inv, 123 + 3, _101);
+  inv = sm(inv, 2 + 2, _11);
+  inv = sm(inv, 1 + 4, _1111);
+  inv = sm(inv, 1 + 4, _1111);
+  inv = sm(inv, 4, _1001);
+  inv = sm(inv, 2, _11);
+  inv = sm(inv, 1 + 4, _1111);
+  inv = sm(inv, 1 + 3, _101);
+  inv = sm(inv, 3 + 3, _101);
+  inv = sm(inv, 3, _111);
+  inv = sm(inv, 1 + 4, _1111);
+  inv = sm(inv, 2 + 3, _111);
+  inv = sm(inv, 2 + 2, _11);
+  inv = sm(inv, 1 + 4, _1011);
+  inv = sm(inv, 2 + 4, _1011);
+  inv = sm(inv, 6 + 4, _1001);
+  inv = sm(inv, 2 + 2, _11);
+  inv = sm(inv, 3 + 2, _11);
+  inv = sm(inv, 3 + 2, _11);
+  inv = sm(inv, 1 + 4, _1001);
+  inv = sm(inv, 1 + 3, _111);
+  inv = sm(inv, 2 + 4, _1111);
+  inv = sm(inv, 1 + 4, _1011);
+  inv = sm(inv, 3, _101);
+  inv = sm(inv, 2 + 4, _1111);
+  inv = sm(inv, 3, _101);
+  inv = sm(inv, 1 + 2, _11);
 
 #ifdef DEBUG_BP
   rct::key tmp;
@@ -329,6 +349,34 @@ static rct::key invert(const rct::key &x)
   return inv;
 }
 
+static rct::keyV invert(rct::keyV x) // batch inversion: one scalar invert() call for the whole vector
+{
+  rct::keyV scratch;
+  scratch.reserve(x.size());
+  // forward pass: scratch[n] receives acc as it stood before x[n] was folded into the running product
+  rct::key acc = rct::identity();
+  for (size_t n = 0; n < x.size(); ++n)
+  {
+    scratch.push_back(acc);
+    if (n == 0)
+      acc = x[0];
+    else
+      sc_mul(acc.bytes, acc.bytes, x[n].bytes);
+  }
+  // acc is now the product of every input: invert that single value
+  acc = invert(acc);
+  // backward pass: x[i] becomes acc * scratch[i], then x[i] is multiplied back out of acc
+  rct::key tmp;
+  for (size_t i = x.size(); i-- > 0; ) // size_t index: no narrowing of x.size(), matches the forward loop
+  {
+    sc_mul(tmp.bytes, acc.bytes, x[i].bytes);
+    sc_mul(x[i].bytes, acc.bytes, scratch[i].bytes);
+    acc = tmp;
+  }
+
+  return x;
+}
+
 /* Compute the slice of a vector */
 static rct::keyV slice(const rct::keyV &a, size_t start, size_t stop)
 {
@@ -702,6 +750,13 @@ Bulletproof bulletproof_PROVE(const std::vector<uint64_t> &v, const rct::keyV &g
   return bulletproof_PROVE(sv, gamma);
 }
 
+struct proof_data_t // per-proof values computed in the first pass of bulletproof_VERIFY and reused in the second
+{
+  rct::key x, y, z, x_ip; // challenges rebuilt from the proof through the hash cache
+  std::vector<rct::key> w; // one inner product challenge per round, hashed from L[i] and R[i]
+  size_t logM, inv_offset; // logM: exponent chosen from proof.V.size(); inv_offset: index of this proof's first entry in the batch inversion array
+};
+
 /* Given a range proof, determine if it is valid */
 bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 {
@@ -709,9 +764,17 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
   PERF_TIMER_START_BP(VERIFY);
 
+  const size_t logN = 6;
+  const size_t N = 1 << logN;
+
   // sanity and figure out which proof is longest
   size_t max_length = 0;
   size_t nV = 0;
+  std::vector<proof_data_t> proof_data; // one entry per proof, filled in the sanity loop below
+  proof_data.reserve(proofs.size());
+  size_t inv_offset = 0; // running index into to_invert where the next proof's scalars start
+  std::vector<rct::key> to_invert; // every w[i] and y of every proof, inverted together in one batch
+  to_invert.reserve(11 * proofs.size()); // scale by the number of proofs, not by sizeof the vector object
   for (const Bulletproof *p: proofs)
   {
     const Bulletproof &proof = *p;
@@ -729,46 +792,75 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
     max_length = std::max(max_length, proof.L.size());
     nV += proof.V.size();
+
+    // Reconstruct the challenges
+    PERF_TIMER_START_BP(VERIFY_start);
+    proof_data.resize(proof_data.size() + 1);
+    proof_data_t &pd = proof_data.back();
+    rct::key hash_cache = rct::hash_to_scalar(proof.V);
+    pd.y = hash_cache_mash(hash_cache, proof.A, proof.S);
+    CHECK_AND_ASSERT_MES(!(pd.y == rct::zero()), false, "y == 0");
+    pd.z = hash_cache = rct::hash_to_scalar(pd.y);
+    CHECK_AND_ASSERT_MES(!(pd.z == rct::zero()), false, "z == 0");
+    pd.x = hash_cache_mash(hash_cache, pd.z, proof.T1, proof.T2);
+    CHECK_AND_ASSERT_MES(!(pd.x == rct::zero()), false, "x == 0");
+    pd.x_ip = hash_cache_mash(hash_cache, pd.x, proof.taux, proof.mu, proof.t);
+    CHECK_AND_ASSERT_MES(!(pd.x_ip == rct::zero()), false, "x_ip == 0");
+    PERF_TIMER_STOP(VERIFY_start);
+
+    size_t M;
+    for (pd.logM = 0; (M = 1<<pd.logM) <= maxM && M < proof.V.size(); ++pd.logM);
+    CHECK_AND_ASSERT_MES(proof.L.size() == 6+pd.logM, false, "Proof is not the expected size");
+
+    const size_t rounds = pd.logM+logN;
+    CHECK_AND_ASSERT_MES(rounds > 0, false, "Zero rounds");
+
+    PERF_TIMER_START_BP(VERIFY_line_21_22);
+    // PAPER LINES 21-22
+    // The inner product challenges are computed per round
+    pd.w.resize(rounds);
+    for (size_t i = 0; i < rounds; ++i)
+    {
+      pd.w[i] = hash_cache_mash(hash_cache, proof.L[i], proof.R[i]);
+      CHECK_AND_ASSERT_MES(!(pd.w[i] == rct::zero()), false, "w[i] == 0");
+    }
+    PERF_TIMER_STOP(VERIFY_line_21_22);
+
+    pd.inv_offset = inv_offset;
+    for (size_t i = 0; i < rounds; ++i)
+      to_invert.push_back(pd.w[i]);
+    to_invert.push_back(pd.y);
+    inv_offset += rounds + 1;
   }
   CHECK_AND_ASSERT_MES(max_length < 32, false, "At least one proof is too large");
   size_t maxMN = 1u << max_length;
 
-  const size_t logN = 6;
-  const size_t N = 1 << logN;
   rct::key tmp;
 
   std::vector<MultiexpData> multiexp_data;
   multiexp_data.reserve(nV + (2 * (10/*logM*/ + logN) + 4) * proofs.size() + 2 * maxMN);
 
+  PERF_TIMER_START_BP(VERIFY_line_24_25_invert);
+  const std::vector<rct::key> inverses = invert(to_invert);
+  PERF_TIMER_STOP(VERIFY_line_24_25_invert);
+
   // setup weighted aggregates
   rct::key z1 = rct::zero();
   rct::key z3 = rct::zero();
   rct::keyV z4(maxMN, rct::zero()), z5(maxMN, rct::zero());
   rct::key y0 = rct::zero(), y1 = rct::zero();
+  int proof_data_index = 0;
   for (const Bulletproof *p: proofs)
   {
     const Bulletproof &proof = *p;
+    const proof_data_t &pd = proof_data[proof_data_index++];
 
-    size_t M, logM;
-    for (logM = 0; (M = 1<<logM) <= maxM && M < proof.V.size(); ++logM);
-    CHECK_AND_ASSERT_MES(proof.L.size() == 6+logM, false, "Proof is not the expected size");
+    CHECK_AND_ASSERT_MES(proof.L.size() == 6+pd.logM, false, "Proof is not the expected size");
+    const size_t M = 1 << pd.logM;
     const size_t MN = M*N;
     const rct::key weight_y = rct::skGen();
     const rct::key weight_z = rct::skGen();
 
-    // Reconstruct the challenges
-    PERF_TIMER_START_BP(VERIFY_start);
-    rct::key hash_cache = rct::hash_to_scalar(proof.V);
-    rct::key y = hash_cache_mash(hash_cache, proof.A, proof.S);
-    CHECK_AND_ASSERT_MES(!(y == rct::zero()), false, "y == 0");
-    rct::key z = hash_cache = rct::hash_to_scalar(y);
-    CHECK_AND_ASSERT_MES(!(z == rct::zero()), false, "z == 0");
-    rct::key x = hash_cache_mash(hash_cache, z, proof.T1, proof.T2);
-    CHECK_AND_ASSERT_MES(!(x == rct::zero()), false, "x == 0");
-    rct::key x_ip = hash_cache_mash(hash_cache, x, proof.taux, proof.mu, proof.t);
-    CHECK_AND_ASSERT_MES(!(x_ip == rct::zero()), false, "x_ip == 0");
-    PERF_TIMER_STOP(VERIFY_start);
-
     // pre-multiply some points by 8
     rct::keyV proof8_V = proof.V; for (rct::key &k: proof8_V) k = rct::scalarmult8(k);
     rct::keyV proof8_L = proof.L; for (rct::key &k: proof8_L) k = rct::scalarmult8(k);
@@ -782,10 +874,10 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     // PAPER LINE 61
     sc_muladd(y0.bytes, proof.taux.bytes, weight_y.bytes, y0.bytes);
 
-    const rct::keyV zpow = vector_powers(z, M+3);
+    const rct::keyV zpow = vector_powers(pd.z, M+3);
 
     rct::key k;
-    const rct::key ip1y = vector_power_sum(y, MN);
+    const rct::key ip1y = vector_power_sum(pd.y, MN);
     sc_mulsub(k.bytes, zpow[2].bytes, ip1y.bytes, rct::zero().bytes);
     for (size_t j = 1; j <= M; ++j)
     {
@@ -795,7 +887,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     PERF_TIMER_STOP(VERIFY_line_61);
 
     PERF_TIMER_START_BP(VERIFY_line_61rl_new);
-    sc_muladd(tmp.bytes, z.bytes, ip1y.bytes, k.bytes);
+    sc_muladd(tmp.bytes, pd.z.bytes, ip1y.bytes, k.bytes);
     sc_sub(tmp.bytes, proof.t.bytes, tmp.bytes);
     sc_muladd(y1.bytes, tmp.bytes, weight_y.bytes, y1.bytes);
     for (size_t j = 0; j < proof8_V.size(); j++)
@@ -803,10 +895,10 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       sc_mul(tmp.bytes, zpow[j+2].bytes, weight_y.bytes);
       multiexp_data.emplace_back(tmp, proof8_V[j]);
     }
-    sc_mul(tmp.bytes, x.bytes, weight_y.bytes);
+    sc_mul(tmp.bytes, pd.x.bytes, weight_y.bytes);
     multiexp_data.emplace_back(tmp, proof8_T1);
     rct::key xsq;
-    sc_mul(xsq.bytes, x.bytes, x.bytes);
+    sc_mul(xsq.bytes, pd.x.bytes, pd.x.bytes);
     sc_mul(tmp.bytes, xsq.bytes, weight_y.bytes);
     multiexp_data.emplace_back(tmp, proof8_T2);
     PERF_TIMER_STOP(VERIFY_line_61rl_new);
@@ -814,49 +906,34 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     PERF_TIMER_START_BP(VERIFY_line_62);
     // PAPER LINE 62
     multiexp_data.emplace_back(weight_z, proof8_A);
-    sc_mul(tmp.bytes, x.bytes, weight_z.bytes);
+    sc_mul(tmp.bytes, pd.x.bytes, weight_z.bytes);
     multiexp_data.emplace_back(tmp, proof8_S);
     PERF_TIMER_STOP(VERIFY_line_62);
 
     // Compute the number of rounds for the inner product
-    const size_t rounds = logM+logN;
+    const size_t rounds = pd.logM+logN;
     CHECK_AND_ASSERT_MES(rounds > 0, false, "Zero rounds");
 
-    PERF_TIMER_START_BP(VERIFY_line_21_22);
-    // PAPER LINES 21-22
-    // The inner product challenges are computed per round
-    rct::keyV w(rounds);
-    for (size_t i = 0; i < rounds; ++i)
-    {
-      w[i] = hash_cache_mash(hash_cache, proof.L[i], proof.R[i]);
-      CHECK_AND_ASSERT_MES(!(w[i] == rct::zero()), false, "w[i] == 0");
-    }
-    PERF_TIMER_STOP(VERIFY_line_21_22);
-
     PERF_TIMER_START_BP(VERIFY_line_24_25);
     // Basically PAPER LINES 24-25
     // Compute the curvepoints from G[i] and H[i]
     rct::key yinvpow = rct::identity();
     rct::key ypow = rct::identity();
 
-    PERF_TIMER_START_BP(VERIFY_line_24_25_invert);
-    const rct::key yinv = invert(y);
-    rct::keyV winv(rounds);
-    for (size_t i = 0; i < rounds; ++i)
-      winv[i] = invert(w[i]);
-    PERF_TIMER_STOP(VERIFY_line_24_25_invert);
+    const rct::key *winv = &inverses[pd.inv_offset];
+    const rct::key yinv = inverses[pd.inv_offset + rounds];
 
     // precalc
     PERF_TIMER_START_BP(VERIFY_line_24_25_precalc);
     rct::keyV w_cache(1<<rounds);
     w_cache[0] = winv[0];
-    w_cache[1] = w[0];
+    w_cache[1] = pd.w[0];
     for (size_t j = 1; j < rounds; ++j)
     {
       const size_t slots = 1<<(j+1);
       for (size_t s = slots; s-- > 0; --s)
       {
-        sc_mul(w_cache[s].bytes, w_cache[s/2].bytes, w[j].bytes);
+        sc_mul(w_cache[s].bytes, w_cache[s/2].bytes, pd.w[j].bytes);
         sc_mul(w_cache[s-1].bytes, w_cache[s/2].bytes, winv[j].bytes);
       }
     }
@@ -876,18 +953,18 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       sc_mul(h_scalar.bytes, h_scalar.bytes, w_cache[(~i) & (MN-1)].bytes);
 
       // Adjust the scalars using the exponents from PAPER LINE 62
-      sc_add(g_scalar.bytes, g_scalar.bytes, z.bytes);
+      sc_add(g_scalar.bytes, g_scalar.bytes, pd.z.bytes);
       CHECK_AND_ASSERT_MES(2+i/N < zpow.size(), false, "invalid zpow index");
       CHECK_AND_ASSERT_MES(i%N < twoN.size(), false, "invalid twoN index");
       sc_mul(tmp.bytes, zpow[2+i/N].bytes, twoN[i%N].bytes);
       if (i == 0)
       {
-        sc_add(tmp.bytes, tmp.bytes, z.bytes);
+        sc_add(tmp.bytes, tmp.bytes, pd.z.bytes);
         sc_sub(h_scalar.bytes, h_scalar.bytes, tmp.bytes);
       }
       else
       {
-        sc_muladd(tmp.bytes, z.bytes, ypow.bytes, tmp.bytes);
+        sc_muladd(tmp.bytes, pd.z.bytes, ypow.bytes, tmp.bytes);
         sc_mulsub(h_scalar.bytes, tmp.bytes, yinvpow.bytes, h_scalar.bytes);
       }
 
@@ -897,12 +974,12 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       if (i == 0)
       {
         yinvpow = yinv;
-        ypow = y;
+        ypow = pd.y;
       }
       else if (i != MN-1)
       {
         sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
-        sc_mul(ypow.bytes, ypow.bytes, y.bytes);
+        sc_mul(ypow.bytes, ypow.bytes, pd.y.bytes);
       }
     }
 
@@ -913,7 +990,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     sc_muladd(z1.bytes, proof.mu.bytes, weight_z.bytes, z1.bytes);
     for (size_t i = 0; i < rounds; ++i)
     {
-      sc_mul(tmp.bytes, w[i].bytes, w[i].bytes);
+      sc_mul(tmp.bytes, pd.w[i].bytes, pd.w[i].bytes);
       sc_mul(tmp.bytes, tmp.bytes, weight_z.bytes);
       multiexp_data.emplace_back(tmp, proof8_L[i]);
       sc_mul(tmp.bytes, winv[i].bytes, winv[i].bytes);
@@ -921,7 +998,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       multiexp_data.emplace_back(tmp, proof8_R[i]);
     }
     sc_mulsub(tmp.bytes, proof.a.bytes, proof.b.bytes, proof.t.bytes);
-    sc_mul(tmp.bytes, tmp.bytes, x_ip.bytes);
+    sc_mul(tmp.bytes, tmp.bytes, pd.x_ip.bytes);
     sc_muladd(z3.bytes, tmp.bytes, weight_z.bytes, z3.bytes);
     PERF_TIMER_STOP(VERIFY_line_26_new);
   }

From 10e5a9276953ece1f96d6801fe6d91d550c5dfae Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Fri, 24 Aug 2018 18:51:14 +0000
Subject: [PATCH 11/16] bulletproofs: maintain -z4, -z5, and -y0 to avoid
 subtractions

---
 src/ringct/bulletproofs.cc | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index d9961cb20..9e4d85534 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -847,8 +847,8 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   // setup weighted aggregates
   rct::key z1 = rct::zero();
   rct::key z3 = rct::zero();
-  rct::keyV z4(maxMN, rct::zero()), z5(maxMN, rct::zero());
-  rct::key y0 = rct::zero(), y1 = rct::zero();
+  rct::keyV m_z4(maxMN, rct::zero()), m_z5(maxMN, rct::zero());
+  rct::key m_y0 = rct::zero(), y1 = rct::zero();
   int proof_data_index = 0;
   for (const Bulletproof *p: proofs)
   {
@@ -872,7 +872,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
     PERF_TIMER_START_BP(VERIFY_line_61);
     // PAPER LINE 61
-    sc_muladd(y0.bytes, proof.taux.bytes, weight_y.bytes, y0.bytes);
+    sc_mulsub(m_y0.bytes, proof.taux.bytes, weight_y.bytes, m_y0.bytes);
 
     const rct::keyV zpow = vector_powers(pd.z, M+3);
 
@@ -968,8 +968,8 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
         sc_mulsub(h_scalar.bytes, tmp.bytes, yinvpow.bytes, h_scalar.bytes);
       }
 
-      sc_muladd(z4[i].bytes, g_scalar.bytes, weight_z.bytes, z4[i].bytes);
-      sc_muladd(z5[i].bytes, h_scalar.bytes, weight_z.bytes, z5[i].bytes);
+      sc_mulsub(m_z4[i].bytes, g_scalar.bytes, weight_z.bytes, m_z4[i].bytes);
+      sc_mulsub(m_z5[i].bytes, h_scalar.bytes, weight_z.bytes, m_z5[i].bytes);
 
       if (i == 0)
       {
@@ -1005,17 +1005,14 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
   // now check all proofs at once
   PERF_TIMER_START_BP(VERIFY_step2_check);
-  sc_sub(tmp.bytes, rct::zero().bytes, y0.bytes);
-  sc_sub(tmp.bytes, tmp.bytes, z1.bytes);
+  sc_sub(tmp.bytes, m_y0.bytes, z1.bytes);
   multiexp_data.emplace_back(tmp, rct::G);
   sc_sub(tmp.bytes, z3.bytes, y1.bytes);
   multiexp_data.emplace_back(tmp, rct::H);
   for (size_t i = 0; i < maxMN; ++i)
   {
-    sc_sub(tmp.bytes, rct::zero().bytes, z4[i].bytes);
-    multiexp_data.emplace_back(tmp, Gi_p3[i]);
-    sc_sub(tmp.bytes, rct::zero().bytes, z5[i].bytes);
-    multiexp_data.emplace_back(tmp, Hi_p3[i]);
+    multiexp_data.emplace_back(m_z4[i], Gi_p3[i]);
+    multiexp_data.emplace_back(m_z5[i], Hi_p3[i]);
   }
   if (!(multiexp(multiexp_data, false) == rct::identity()))
   {

From 6f9ae5b6eb535fb748722ed1c81f981474422ead Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Fri, 24 Aug 2018 21:20:56 +0000
Subject: [PATCH 12/16] multiexp: handle pippenger multiexps with part precalc

---
 src/ringct/bulletproofs.cc         | 21 +++++++++++----------
 src/ringct/multiexp.cc             | 21 ++++++++++++++-------
 src/ringct/multiexp.h              |  4 ++--
 tests/performance_tests/multiexp.h |  4 ++--
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 9e4d85534..a3af9264e 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -73,15 +73,15 @@ static const rct::keyV twoN = vector_powers(TWO, maxN);
 static const rct::key ip12 = inner_product(oneN, twoN);
 static boost::mutex init_mutex;
 
-static inline rct::key multiexp(const std::vector<MultiexpData> &data, bool HiGi)
+static inline rct::key multiexp(const std::vector<MultiexpData> &data, size_t HiGi_size)
 {
-  if (HiGi)
+  if (HiGi_size > 0)
   {
     static_assert(128 <= STRAUS_SIZE_LIMIT, "Straus in precalc mode can only be calculated till STRAUS_SIZE_LIMIT");
-    return data.size() <= 128 ? straus(data, straus_HiGi_cache, 0) : pippenger(data, pippenger_HiGi_cache, get_pippenger_c(data.size()));
+    return HiGi_size <= 128 && data.size() == HiGi_size ? straus(data, straus_HiGi_cache, 0) : pippenger(data, pippenger_HiGi_cache, HiGi_size, get_pippenger_c(data.size()));
   }
   else
-    return data.size() <= 64 ? straus(data, NULL, 0) : pippenger(data, NULL, get_pippenger_c(data.size()));
+    return data.size() <= 64 ? straus(data, NULL, 0) : pippenger(data, NULL, 0, get_pippenger_c(data.size()));
 }
 
 static inline bool is_reduced(const rct::key &scalar)
@@ -118,7 +118,7 @@ static void init_exponents()
   }
 
   straus_HiGi_cache = straus_init_cache(data, STRAUS_SIZE_LIMIT);
-  pippenger_HiGi_cache = pippenger_init_cache(data, PIPPENGER_SIZE_LIMIT);
+  pippenger_HiGi_cache = pippenger_init_cache(data, 0, PIPPENGER_SIZE_LIMIT);
 
   MINFO("Hi/Gi cache size: " << (sizeof(Hi)+sizeof(Gi))/1024 << " kB");
   MINFO("Hi_p3/Gi_p3 cache size: " << (sizeof(Hi_p3)+sizeof(Gi_p3))/1024 << " kB");
@@ -142,7 +142,7 @@ static rct::key vector_exponent(const rct::keyV &a, const rct::keyV &b)
     multiexp_data.emplace_back(a[i], Gi_p3[i]);
     multiexp_data.emplace_back(b[i], Hi_p3[i]);
   }
-  return multiexp(multiexp_data, true);
+  return multiexp(multiexp_data, 2 * a.size());
 }
 
 /* Compute a custom vector-scalar commitment */
@@ -169,7 +169,7 @@ static rct::key cross_vector_exponent8(size_t size, const std::vector<ge_p3> &A,
     sc_mul(multiexp_data.back().scalar.bytes, extra_scalar->bytes, INV_EIGHT.bytes);
     multiexp_data.back().point = *extra_point;
   }
-  return multiexp(multiexp_data, false);
+  return multiexp(multiexp_data, 0);
 }
 
 /* Given a scalar, construct a vector of powers */
@@ -839,6 +839,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
   std::vector<MultiexpData> multiexp_data;
   multiexp_data.reserve(nV + (2 * (10/*logM*/ + logN) + 4) * proofs.size() + 2 * maxMN);
+  multiexp_data.resize(2 * maxMN);
 
   PERF_TIMER_START_BP(VERIFY_line_24_25_invert);
   const std::vector<rct::key> inverses = invert(to_invert);
@@ -1011,10 +1012,10 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   multiexp_data.emplace_back(tmp, rct::H);
   for (size_t i = 0; i < maxMN; ++i)
   {
-    multiexp_data.emplace_back(m_z4[i], Gi_p3[i]);
-    multiexp_data.emplace_back(m_z5[i], Hi_p3[i]);
+    multiexp_data[i * 2] = {m_z4[i], Gi_p3[i]};
+    multiexp_data[i * 2 + 1] = {m_z5[i], Hi_p3[i]};
   }
-  if (!(multiexp(multiexp_data, false) == rct::identity()))
+  if (!(multiexp(multiexp_data, 2 * maxMN) == rct::identity()))
   {
     PERF_TIMER_STOP(VERIFY_step2_check);
     MERROR("Verification failure");
diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc
index fb2f18551..178f92267 100644
--- a/src/ringct/multiexp.cc
+++ b/src/ringct/multiexp.cc
@@ -564,12 +564,13 @@ struct pippenger_cached_data
   ~pippenger_cached_data() { aligned_free(cached); }
 };
 
-std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t N)
+std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t start_offset, size_t N)
 {
   MULTIEXP_PERF(PERF_TIMER_START_UNIT(pippenger_init_cache, 1000000));
+  CHECK_AND_ASSERT_THROW_MES(start_offset <= data.size(), "Bad cache base data");
   if (N == 0)
-    N = data.size();
-  CHECK_AND_ASSERT_THROW_MES(N <= data.size(), "Bad cache base data");
+    N = data.size() - start_offset;
+  CHECK_AND_ASSERT_THROW_MES(N <= data.size() - start_offset, "Bad cache base data");
   ge_cached cached;
   std::shared_ptr<pippenger_cached_data> cache(new pippenger_cached_data());
 
@@ -577,7 +578,7 @@ std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<Mu
   cache->cached = (ge_cached*)aligned_realloc(cache->cached, N * sizeof(ge_cached), 4096);
   CHECK_AND_ASSERT_THROW_MES(cache->cached, "Out of memory");
   for (size_t i = 0; i < N; ++i)
-    ge_p3_to_cached(&cache->cached[i], &data[i].point);
+    ge_p3_to_cached(&cache->cached[i], &data[i+start_offset].point);
 
   MULTIEXP_PERF(PERF_TIMER_STOP(pippenger_init_cache));
   return cache;
@@ -588,9 +589,11 @@ size_t pippenger_get_cache_size(const std::shared_ptr<pippenger_cached_data> &ca
   return cache->size * sizeof(*cache->cached);
 }
 
-rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache, size_t c)
+rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache, size_t cache_size, size_t c)
 {
-  CHECK_AND_ASSERT_THROW_MES(cache == NULL || cache->size >= data.size(), "Cache is too small");
+  if (cache != NULL && cache_size == 0)
+    cache_size = cache->size;
+  CHECK_AND_ASSERT_THROW_MES(cache == NULL || cache_size <= cache->size, "Cache is too small");
   if (c == 0)
     c = get_pippenger_c(data.size());
   CHECK_AND_ASSERT_THROW_MES(c <= 9, "c is too large");
@@ -598,6 +601,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
   ge_p3 result = ge_p3_identity;
   std::unique_ptr<ge_p3[]> buckets{new ge_p3[1<<c]};
   std::shared_ptr<pippenger_cached_data> local_cache = cache == NULL ? pippenger_init_cache(data) : cache;
+  std::shared_ptr<pippenger_cached_data> local_cache_2 = data.size() > cache_size ? pippenger_init_cache(data, cache_size) : NULL;
 
   rct::key maxscalar = rct::zero();
   for (size_t i = 0; i < data.size(); ++i)
@@ -641,7 +645,10 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
       CHECK_AND_ASSERT_THROW_MES(bucket < (1u<<c), "bucket overflow");
       if (!ge_p3_is_point_at_infinity(&buckets[bucket]))
       {
-        add(buckets[bucket], local_cache->cached[i]);
+        if (i < cache_size)
+          add(buckets[bucket], local_cache->cached[i]);
+        else
+          add(buckets[bucket], local_cache_2->cached[i - cache_size]);
       }
       else
         buckets[bucket] = data[i].point;
diff --git a/src/ringct/multiexp.h b/src/ringct/multiexp.h
index 559ab664a..b52707933 100644
--- a/src/ringct/multiexp.h
+++ b/src/ringct/multiexp.h
@@ -61,10 +61,10 @@ rct::key bos_coster_heap_conv_robust(std::vector<MultiexpData> data);
 std::shared_ptr<straus_cached_data> straus_init_cache(const std::vector<MultiexpData> &data, size_t N =0);
 size_t straus_get_cache_size(const std::shared_ptr<straus_cached_data> &cache);
 rct::key straus(const std::vector<MultiexpData> &data, const std::shared_ptr<straus_cached_data> &cache = NULL, size_t STEP = 0);
-std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t N =0);
+std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t start_offset = 0, size_t N =0);
 size_t pippenger_get_cache_size(const std::shared_ptr<pippenger_cached_data> &cache);
 size_t get_pippenger_c(size_t N);
-rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache = NULL, size_t c = 0);
+rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache = NULL, size_t cache_size = 0, size_t c = 0);
 
 }
 
diff --git a/tests/performance_tests/multiexp.h b/tests/performance_tests/multiexp.h
index b8b87b3a6..b6e86ebd4 100644
--- a/tests/performance_tests/multiexp.h
+++ b/tests/performance_tests/multiexp.h
@@ -78,9 +78,9 @@ public:
       case multiexp_straus_cached:
         return res == straus(data, straus_cache);
       case multiexp_pippenger:
-        return res == pippenger(data, NULL, c);
+        return res == pippenger(data, NULL, 0, c);
       case multiexp_pippenger_cached:
-        return res == pippenger(data, pippenger_cache, c);
+        return res == pippenger(data, pippenger_cache, 0, c);
       default:
         return false;
     }

From 8b4767221c9b0ff3015229b167e5be1331a16c12 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Sat, 25 Aug 2018 18:37:21 +0000
Subject: [PATCH 13/16] bulletproofs: speedup prover

---
 src/ringct/bulletproofs.cc | 54 ++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index a3af9264e..f1b821978 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -146,13 +146,14 @@ static rct::key vector_exponent(const rct::keyV &a, const rct::keyV &b)
 }
 
 /* Compute a custom vector-scalar commitment */
-static rct::key cross_vector_exponent8(size_t size, const std::vector<ge_p3> &A, size_t Ao, const std::vector<ge_p3> &B, size_t Bo, const rct::keyV &a, size_t ao, const rct::keyV &b, size_t bo, const ge_p3 *extra_point, const rct::key *extra_scalar)
+static rct::key cross_vector_exponent8(size_t size, const std::vector<ge_p3> &A, size_t Ao, const std::vector<ge_p3> &B, size_t Bo, const rct::keyV &a, size_t ao, const rct::keyV &b, size_t bo, const rct::keyV *scale, const ge_p3 *extra_point, const rct::key *extra_scalar)
 {
   CHECK_AND_ASSERT_THROW_MES(size + Ao <= A.size(), "Incompatible size for A");
   CHECK_AND_ASSERT_THROW_MES(size + Bo <= B.size(), "Incompatible size for B");
   CHECK_AND_ASSERT_THROW_MES(size + ao <= a.size(), "Incompatible size for a");
   CHECK_AND_ASSERT_THROW_MES(size + bo <= b.size(), "Incompatible size for b");
   CHECK_AND_ASSERT_THROW_MES(size <= maxN*maxM, "size is too large");
+  CHECK_AND_ASSERT_THROW_MES(!scale || size == scale->size() / 2, "Incompatible size for scale");
   CHECK_AND_ASSERT_THROW_MES(!!extra_point == !!extra_scalar, "only one of extra point/scalar present");
 
   std::vector<MultiexpData> multiexp_data;
@@ -162,6 +163,8 @@ static rct::key cross_vector_exponent8(size_t size, const std::vector<ge_p3> &A,
     sc_mul(multiexp_data[i*2].scalar.bytes, a[ao+i].bytes, INV_EIGHT.bytes);;
     multiexp_data[i*2].point = A[Ao+i];
     sc_mul(multiexp_data[i*2+1].scalar.bytes, b[bo+i].bytes, INV_EIGHT.bytes);
+    if (scale)
+      sc_mul(multiexp_data[i*2+1].scalar.bytes, multiexp_data[i*2+1].scalar.bytes, (*scale)[Bo+i].bytes);
     multiexp_data[i*2+1].point = B[Bo+i];
   }
   if (extra_point)
@@ -232,7 +235,7 @@ static rct::keyV hadamard(const rct::keyV &a, const rct::keyV &b)
 }
 
 /* folds a curvepoint array using a two way scaled Hadamard product */
-static void hadamard_fold(std::vector<ge_p3> &v, const rct::key &a, const rct::key &b)
+static void hadamard_fold(std::vector<ge_p3> &v, const rct::keyV *scale, const rct::key &a, const rct::key &b)
 {
   CHECK_AND_ASSERT_THROW_MES((v.size() & 1) == 0, "Vector size should be even");
   const size_t sz = v.size() / 2;
@@ -241,7 +244,10 @@ static void hadamard_fold(std::vector<ge_p3> &v, const rct::key &a, const rct::k
     ge_dsmp c[2];
     ge_dsm_precomp(c[0], &v[n]);
     ge_dsm_precomp(c[1], &v[sz + n]);
-    ge_double_scalarmult_precomp_vartime2_p3(&v[n], a.bytes, c[0], b.bytes, c[1]);
+    rct::key sa, sb;
+    if (scale) sc_mul(sa.bytes, a.bytes, (*scale)[n].bytes); else sa = a;
+    if (scale) sc_mul(sb.bytes, b.bytes, (*scale)[sz + n].bytes); else sb = b;
+    ge_double_scalarmult_precomp_vartime2_p3(&v[n], sa.bytes, c[0], sb.bytes, c[1]);
   }
   v.resize(sz);
 }
@@ -258,14 +264,24 @@ static rct::keyV vector_add(const rct::keyV &a, const rct::keyV &b)
   return res;
 }
 
-/* Subtract two vectors */
-static rct::keyV vector_subtract(const rct::keyV &a, const rct::keyV &b)
+/* Add a scalar to all elements of a vector */
+static rct::keyV vector_add(const rct::keyV &a, const rct::key &b)
 {
-  CHECK_AND_ASSERT_THROW_MES(a.size() == b.size(), "Incompatible sizes of a and b");
   rct::keyV res(a.size());
   for (size_t i = 0; i < a.size(); ++i)
   {
-    sc_sub(res[i].bytes, a[i].bytes, b[i].bytes);
+    sc_add(res[i].bytes, a[i].bytes, b.bytes);
+  }
+  return res;
+}
+
+/* Subtract a scalar from all elements of a vector */
+static rct::keyV vector_subtract(const rct::keyV &a, const rct::key &b)
+{
+  rct::keyV res(a.size());
+  for (size_t i = 0; i < a.size(); ++i)
+  {
+    sc_sub(res[i].bytes, a[i].bytes, b.bytes);
   }
   return res;
 }
@@ -549,8 +565,7 @@ try_again:
   }
 
   // Polynomial construction by coefficients
-  const auto zMN = vector_dup(z, MN);
-  rct::keyV l0 = vector_subtract(aL, zMN);
+  rct::keyV l0 = vector_subtract(aL, z);
   const rct::keyV &l1 = sL;
 
   // This computes the ugly sum/concatenation from PAPER LINE 65
@@ -570,7 +585,7 @@ try_again:
     }
   }
 
-  rct::keyV r0 = vector_add(aR, zMN);
+  rct::keyV r0 = vector_add(aR, z);
   const auto yMN = vector_powers(y, MN);
   r0 = hadamard(r0, yMN);
   r0 = vector_add(r0, zero_twos);
@@ -658,12 +673,15 @@ try_again:
   rct::keyV aprime(MN);
   rct::keyV bprime(MN);
   const rct::key yinv = invert(y);
-  rct::key yinvpow = rct::identity();
+  rct::keyV yinvpow(MN);
+  yinvpow[0] = rct::identity();
+  yinvpow[1] = yinv;
   for (size_t i = 0; i < MN; ++i)
   {
     Gprime[i] = Gi_p3[i];
-    ge_scalarmult_p3(&Hprime[i], yinvpow.bytes, &Hi_p3[i]);
-    sc_mul(yinvpow.bytes, yinvpow.bytes, yinv.bytes);
+    Hprime[i] = Hi_p3[i];
+    if (i > 1)
+      sc_mul(yinvpow[i].bytes, yinvpow[i-1].bytes, yinv.bytes);
     aprime[i] = l[i];
     bprime[i] = r[i];
   }
@@ -675,6 +693,7 @@ try_again:
 
   PERF_TIMER_START_BP(PROVE_step4);
   // PAPER LINE 13
+  const rct::keyV *scale = &yinvpow;
   while (nprime > 1)
   {
     // PAPER LINE 15
@@ -689,9 +708,9 @@ try_again:
     // PAPER LINES 18-19
     PERF_TIMER_START_BP(PROVE_LR);
     sc_mul(tmp.bytes, cL.bytes, x_ip.bytes);
-    L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, &ge_p3_H, &tmp);
+    L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, scale, &ge_p3_H, &tmp);
     sc_mul(tmp.bytes, cR.bytes, x_ip.bytes);
-    R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, &ge_p3_H, &tmp);
+    R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, scale, &ge_p3_H, &tmp);
     PERF_TIMER_STOP(PROVE_LR);
 
     // PAPER LINES 21-22
@@ -708,8 +727,8 @@ try_again:
     if (nprime > 1)
     {
       PERF_TIMER_START_BP(PROVE_hadamard2);
-      hadamard_fold(Gprime, winv, w[round]);
-      hadamard_fold(Hprime, w[round], winv);
+      hadamard_fold(Gprime, NULL, winv, w[round]);
+      hadamard_fold(Hprime, scale, w[round], winv);
       PERF_TIMER_STOP(PROVE_hadamard2);
     }
 
@@ -719,6 +738,7 @@ try_again:
     bprime = vector_add(vector_scalar(slice(bprime, 0, nprime), winv), vector_scalar(slice(bprime, nprime, bprime.size()), w[round]));
     PERF_TIMER_STOP(PROVE_prime);
 
+    scale = NULL;
     ++round;
   }
   PERF_TIMER_STOP(PROVE_step4);

From a110e6aa18928fb0bc13d0e39c854aea9e99e96d Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Mon, 27 Aug 2018 13:23:57 +0000
Subject: [PATCH 14/16] multiexp: tune which variants to use for which number
 of points

---
 src/ringct/bulletproofs.cc |  8 ++++----
 src/ringct/multiexp.cc     | 36 ++++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index f1b821978..4e1c940d1 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -48,7 +48,7 @@ extern "C"
 
 #define PERF_TIMER_START_BP(x) PERF_TIMER_START_UNIT(x, 1000000)
 
-#define STRAUS_SIZE_LIMIT 128
+#define STRAUS_SIZE_LIMIT 232
 #define PIPPENGER_SIZE_LIMIT 0
 
 namespace rct
@@ -77,11 +77,11 @@ static inline rct::key multiexp(const std::vector<MultiexpData> &data, size_t Hi
 {
   if (HiGi_size > 0)
   {
-    static_assert(128 <= STRAUS_SIZE_LIMIT, "Straus in precalc mode can only be calculated till STRAUS_SIZE_LIMIT");
-    return HiGi_size <= 128 && data.size() == HiGi_size ? straus(data, straus_HiGi_cache, 0) : pippenger(data, pippenger_HiGi_cache, HiGi_size, get_pippenger_c(data.size()));
+    static_assert(232 <= STRAUS_SIZE_LIMIT, "Straus in precalc mode can only be calculated till STRAUS_SIZE_LIMIT");
+    return HiGi_size <= 232 && data.size() == HiGi_size ? straus(data, straus_HiGi_cache, 0) : pippenger(data, pippenger_HiGi_cache, HiGi_size, get_pippenger_c(data.size()));
   }
   else
-    return data.size() <= 64 ? straus(data, NULL, 0) : pippenger(data, NULL, 0, get_pippenger_c(data.size()));
+    return data.size() <= 95 ? straus(data, NULL, 0) : pippenger(data, NULL, 0, get_pippenger_c(data.size()));
 }
 
 static inline bool is_reduced(const rct::key &scalar)
diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc
index 178f92267..85b1dfed4 100644
--- a/src/ringct/multiexp.cc
+++ b/src/ringct/multiexp.cc
@@ -79,6 +79,25 @@ extern "C"
 // Best/cached		Straus	Straus	Straus	Straus	Straus	Straus	Straus	Straus	Pip	Pip	Pip	Pip
 // Best/uncached	Straus	Straus	Straus	Straus	Straus	Straus	Pip	Pip	Pip	Pip	Pip	Pip
 
+// New timings ("a/b at ~n": pippenger with c=a starts beating c=b at about n points):
+//   Pippenger:
+//     2/1 always
+//     3/2 at ~13
+//     4/3 at ~29
+//     5/4 at ~83
+//     6/5 < 200
+//     7/6 at ~470
+//     8/7 at ~1180
+//     9/8 at ~2290
+//   Cached Pippenger:
+//     6/5 < 200
+//     7/6 at 460
+//     8/7 at 1180
+//     9/8 at 2300
+//
+//     Cached Straus/Pippenger cross at 232
+//
+
 namespace rct
 {
 
@@ -543,16 +562,13 @@ skipfirst:
 
 size_t get_pippenger_c(size_t N)
 {
-// uncached: 2:1, 4:2, 8:2, 16:3, 32:4, 64:4, 128:5, 256:6, 512:7, 1024:7, 2048:8, 4096:9
-//   cached: 2:1, 4:2, 8:2, 16:3, 32:4, 64:4, 128:5, 256:6, 512:7, 1024:7, 2048:8, 4096:9
-  if (N <= 2) return 1;
-  if (N <= 8) return 2;
-  if (N <= 16) return 3;
-  if (N <= 64) return 4;
-  if (N <= 128) return 5;
-  if (N <= 256) return 6;
-  if (N <= 1024) return 7;
-  if (N <= 2048) return 8;
+  if (N <= 13) return 2;
+  if (N <= 29) return 3;
+  if (N <= 83) return 4;
+  if (N <= 185) return 5;
+  if (N <= 465) return 6;
+  if (N <= 1180) return 7;
+  if (N <= 2295) return 8;
   return 9;
 }
 

From a6d2e246f37d5a91a33defb6f156b229f6274531 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Sat, 1 Sep 2018 17:46:34 +0000
Subject: [PATCH 15/16] bulletproofs: only enable profiling on request

---
 src/ringct/bulletproofs.cc | 62 +++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc
index 4e1c940d1..bed48769a 100644
--- a/src/ringct/bulletproofs.cc
+++ b/src/ringct/bulletproofs.cc
@@ -46,7 +46,13 @@ extern "C"
 
 //#define DEBUG_BP
 
+#if 1
 #define PERF_TIMER_START_BP(x) PERF_TIMER_START_UNIT(x, 1000000)
+#define PERF_TIMER_STOP_BP(x) PERF_TIMER_STOP(x)
+#else
+#define PERF_TIMER_START_BP(x) ((void*)0)
+#define PERF_TIMER_STOP_BP(x) ((void*)0)
+#endif
 
 #define STRAUS_SIZE_LIMIT 232
 #define PIPPENGER_SIZE_LIMIT 0
@@ -486,7 +492,7 @@ Bulletproof bulletproof_PROVE(const rct::keyV &sv, const rct::keyV &gamma)
     sc_mul(sv8.bytes, sv[i].bytes, INV_EIGHT.bytes);
     rct::addKeys2(V[i], gamma8, sv8, rct::H);
   }
-  PERF_TIMER_STOP(PROVE_v);
+  PERF_TIMER_STOP_BP(PROVE_v);
 
   PERF_TIMER_START_BP(PROVE_aLaR);
   for (size_t j = 0; j < M; ++j)
@@ -507,7 +513,7 @@ Bulletproof bulletproof_PROVE(const rct::keyV &sv, const rct::keyV &gamma)
       }
     }
   }
-  PERF_TIMER_STOP(PROVE_aLaR);
+  PERF_TIMER_STOP_BP(PROVE_aLaR);
 
   // DEBUG: Test to ensure this recovers the value
 #ifdef DEBUG_BP
@@ -552,14 +558,14 @@ try_again:
   rct::key y = hash_cache_mash(hash_cache, A, S);
   if (y == rct::zero())
   {
-    PERF_TIMER_STOP(PROVE_step1);
+    PERF_TIMER_STOP_BP(PROVE_step1);
     MINFO("y is 0, trying again");
     goto try_again;
   }
   rct::key z = hash_cache = rct::hash_to_scalar(y);
   if (z == rct::zero())
   {
-    PERF_TIMER_STOP(PROVE_step1);
+    PERF_TIMER_STOP_BP(PROVE_step1);
     MINFO("z is 0, trying again");
     goto try_again;
   }
@@ -598,7 +604,7 @@ try_again:
   sc_add(t1.bytes, t1_1.bytes, t1_2.bytes);
   rct::key t2 = inner_product(l1, r1);
 
-  PERF_TIMER_STOP(PROVE_step1);
+  PERF_TIMER_STOP_BP(PROVE_step1);
 
   PERF_TIMER_START_BP(PROVE_step2);
   // PAPER LINES 47-48
@@ -619,7 +625,7 @@ try_again:
   rct::key x = hash_cache_mash(hash_cache, z, T1, T2);
   if (x == rct::zero())
   {
-    PERF_TIMER_STOP(PROVE_step2);
+    PERF_TIMER_STOP_BP(PROVE_step2);
     MINFO("x is 0, trying again");
     goto try_again;
   }
@@ -643,7 +649,7 @@ try_again:
   l = vector_add(l, vector_scalar(l1, x));
   rct::keyV r = r0;
   r = vector_add(r, vector_scalar(r1, x));
-  PERF_TIMER_STOP(PROVE_step2);
+  PERF_TIMER_STOP_BP(PROVE_step2);
 
   PERF_TIMER_START_BP(PROVE_step3);
   rct::key t = inner_product(l, r);
@@ -661,7 +667,7 @@ try_again:
   rct::key x_ip = hash_cache_mash(hash_cache, x, taux, mu, t);
   if (x_ip == rct::zero())
   {
-    PERF_TIMER_STOP(PROVE_step3);
+    PERF_TIMER_STOP_BP(PROVE_step3);
     MINFO("x_ip is 0, trying again");
     goto try_again;
   }
@@ -689,7 +695,7 @@ try_again:
   rct::keyV R(logMN);
   int round = 0;
   rct::keyV w(logMN); // this is the challenge x in the inner product protocol
-  PERF_TIMER_STOP(PROVE_step3);
+  PERF_TIMER_STOP_BP(PROVE_step3);
 
   PERF_TIMER_START_BP(PROVE_step4);
   // PAPER LINE 13
@@ -703,7 +709,7 @@ try_again:
     PERF_TIMER_START_BP(PROVE_inner_product);
     rct::key cL = inner_product(slice(aprime, 0, nprime), slice(bprime, nprime, bprime.size()));
     rct::key cR = inner_product(slice(aprime, nprime, aprime.size()), slice(bprime, 0, nprime));
-    PERF_TIMER_STOP(PROVE_inner_product);
+    PERF_TIMER_STOP_BP(PROVE_inner_product);
 
     // PAPER LINES 18-19
     PERF_TIMER_START_BP(PROVE_LR);
@@ -711,13 +717,13 @@ try_again:
     L[round] = cross_vector_exponent8(nprime, Gprime, nprime, Hprime, 0, aprime, 0, bprime, nprime, scale, &ge_p3_H, &tmp);
     sc_mul(tmp.bytes, cR.bytes, x_ip.bytes);
     R[round] = cross_vector_exponent8(nprime, Gprime, 0, Hprime, nprime, aprime, nprime, bprime, 0, scale, &ge_p3_H, &tmp);
-    PERF_TIMER_STOP(PROVE_LR);
+    PERF_TIMER_STOP_BP(PROVE_LR);
 
     // PAPER LINES 21-22
     w[round] = hash_cache_mash(hash_cache, L[round], R[round]);
     if (w[round] == rct::zero())
     {
-      PERF_TIMER_STOP(PROVE_step4);
+      PERF_TIMER_STOP_BP(PROVE_step4);
       MINFO("w[round] is 0, trying again");
       goto try_again;
     }
@@ -729,19 +735,19 @@ try_again:
       PERF_TIMER_START_BP(PROVE_hadamard2);
       hadamard_fold(Gprime, NULL, winv, w[round]);
       hadamard_fold(Hprime, scale, w[round], winv);
-      PERF_TIMER_STOP(PROVE_hadamard2);
+      PERF_TIMER_STOP_BP(PROVE_hadamard2);
     }
 
     // PAPER LINES 28-29
     PERF_TIMER_START_BP(PROVE_prime);
     aprime = vector_add(vector_scalar(slice(aprime, 0, nprime), w[round]), vector_scalar(slice(aprime, nprime, aprime.size()), winv));
     bprime = vector_add(vector_scalar(slice(bprime, 0, nprime), winv), vector_scalar(slice(bprime, nprime, bprime.size()), w[round]));
-    PERF_TIMER_STOP(PROVE_prime);
+    PERF_TIMER_STOP_BP(PROVE_prime);
 
     scale = NULL;
     ++round;
   }
-  PERF_TIMER_STOP(PROVE_step4);
+  PERF_TIMER_STOP_BP(PROVE_step4);
 
   // PAPER LINE 58 (with inclusions from PAPER LINE 8 and PAPER LINE 20)
   return Bulletproof(std::move(V), A, S, T1, T2, taux, mu, std::move(L), std::move(R), aprime[0], bprime[0], t);
@@ -766,7 +772,7 @@ Bulletproof bulletproof_PROVE(const std::vector<uint64_t> &v, const rct::keyV &g
     sv[i].bytes[6] = (v[i] >> 48) & 255;
     sv[i].bytes[7] = (v[i] >> 56) & 255;
   }
-  PERF_TIMER_STOP(PROVE_v);
+  PERF_TIMER_STOP_BP(PROVE_v);
   return bulletproof_PROVE(sv, gamma);
 }
 
@@ -826,7 +832,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     CHECK_AND_ASSERT_MES(!(pd.x == rct::zero()), false, "x == 0");
     pd.x_ip = hash_cache_mash(hash_cache, pd.x, proof.taux, proof.mu, proof.t);
     CHECK_AND_ASSERT_MES(!(pd.x_ip == rct::zero()), false, "x_ip == 0");
-    PERF_TIMER_STOP(VERIFY_start);
+    PERF_TIMER_STOP_BP(VERIFY_start);
 
     size_t M;
     for (pd.logM = 0; (M = 1<<pd.logM) <= maxM && M < proof.V.size(); ++pd.logM);
@@ -844,7 +850,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       pd.w[i] = hash_cache_mash(hash_cache, proof.L[i], proof.R[i]);
       CHECK_AND_ASSERT_MES(!(pd.w[i] == rct::zero()), false, "w[i] == 0");
     }
-    PERF_TIMER_STOP(VERIFY_line_21_22);
+    PERF_TIMER_STOP_BP(VERIFY_line_21_22);
 
     pd.inv_offset = inv_offset;
     for (size_t i = 0; i < rounds; ++i)
@@ -863,7 +869,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
 
   PERF_TIMER_START_BP(VERIFY_line_24_25_invert);
   const std::vector<rct::key> inverses = invert(to_invert);
-  PERF_TIMER_STOP(VERIFY_line_24_25_invert);
+  PERF_TIMER_STOP_BP(VERIFY_line_24_25_invert);
 
   // setup weighted aggregates
   rct::key z1 = rct::zero();
@@ -905,7 +911,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       CHECK_AND_ASSERT_MES(j+2 < zpow.size(), false, "invalid zpow index");
       sc_mulsub(k.bytes, zpow[j+2].bytes, ip12.bytes, k.bytes);
     }
-    PERF_TIMER_STOP(VERIFY_line_61);
+    PERF_TIMER_STOP_BP(VERIFY_line_61);
 
     PERF_TIMER_START_BP(VERIFY_line_61rl_new);
     sc_muladd(tmp.bytes, pd.z.bytes, ip1y.bytes, k.bytes);
@@ -922,14 +928,14 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     sc_mul(xsq.bytes, pd.x.bytes, pd.x.bytes);
     sc_mul(tmp.bytes, xsq.bytes, weight_y.bytes);
     multiexp_data.emplace_back(tmp, proof8_T2);
-    PERF_TIMER_STOP(VERIFY_line_61rl_new);
+    PERF_TIMER_STOP_BP(VERIFY_line_61rl_new);
 
     PERF_TIMER_START_BP(VERIFY_line_62);
     // PAPER LINE 62
     multiexp_data.emplace_back(weight_z, proof8_A);
     sc_mul(tmp.bytes, pd.x.bytes, weight_z.bytes);
     multiexp_data.emplace_back(tmp, proof8_S);
-    PERF_TIMER_STOP(VERIFY_line_62);
+    PERF_TIMER_STOP_BP(VERIFY_line_62);
 
     // Compute the number of rounds for the inner product
     const size_t rounds = pd.logM+logN;
@@ -958,7 +964,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
         sc_mul(w_cache[s-1].bytes, w_cache[s/2].bytes, winv[j].bytes);
       }
     }
-    PERF_TIMER_STOP(VERIFY_line_24_25_precalc);
+    PERF_TIMER_STOP_BP(VERIFY_line_24_25_precalc);
 
     for (size_t i = 0; i < MN; ++i)
     {
@@ -1004,7 +1010,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
       }
     }
 
-    PERF_TIMER_STOP(VERIFY_line_24_25);
+    PERF_TIMER_STOP_BP(VERIFY_line_24_25);
 
     // PAPER LINE 26
     PERF_TIMER_START_BP(VERIFY_line_26_new);
@@ -1021,7 +1027,7 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
     sc_mulsub(tmp.bytes, proof.a.bytes, proof.b.bytes, proof.t.bytes);
     sc_mul(tmp.bytes, tmp.bytes, pd.x_ip.bytes);
     sc_muladd(z3.bytes, tmp.bytes, weight_z.bytes, z3.bytes);
-    PERF_TIMER_STOP(VERIFY_line_26_new);
+    PERF_TIMER_STOP_BP(VERIFY_line_26_new);
   }
 
   // now check all proofs at once
@@ -1037,13 +1043,13 @@ bool bulletproof_VERIFY(const std::vector<const Bulletproof*> &proofs)
   }
   if (!(multiexp(multiexp_data, 2 * maxMN) == rct::identity()))
   {
-    PERF_TIMER_STOP(VERIFY_step2_check);
+    PERF_TIMER_STOP_BP(VERIFY_step2_check);
     MERROR("Verification failure");
     return false;
   }
-  PERF_TIMER_STOP(VERIFY_step2_check);
+  PERF_TIMER_STOP_BP(VERIFY_step2_check);
 
-  PERF_TIMER_STOP(VERIFY);
+  PERF_TIMER_STOP_BP(VERIFY);
   return true;
 }
 

From 74fb3d882c9a50798795367ca8cd5dac85138c78 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Mon, 22 Oct 2018 11:56:45 +0000
Subject: [PATCH 16/16] multiexp: some minor speedups

---
 src/ringct/multiexp.cc | 48 ++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc
index 85b1dfed4..6f77fed34 100644
--- a/src/ringct/multiexp.cc
+++ b/src/ringct/multiexp.cc
@@ -473,10 +473,7 @@ rct::key straus(const std::vector<MultiexpData> &data, const std::shared_ptr<str
 #endif
   for (size_t j = 0; j < data.size(); ++j)
   {
-    unsigned char bytes33[33];
-    memcpy(bytes33,  data[j].scalar.bytes, 32);
-    bytes33[32] = 0;
-    const unsigned char *bytes = bytes33;
+    const unsigned char *bytes = data[j].scalar.bytes;
 #if STRAUS_C==4
     unsigned int i;
     for (i = 0; i < 64; i += 2, bytes++)
@@ -485,6 +482,10 @@ rct::key straus(const std::vector<MultiexpData> &data, const std::shared_ptr<str
       digits[j*64+i+1] = bytes[0] >> 4;
     }
 #elif 1
+    unsigned char bytes33[33];
+    memcpy(bytes33,  data[j].scalar.bytes, 32);
+    bytes33[32] = 0;
+    bytes = bytes33;
     for (size_t i = 0; i < 256; ++i)
       digits[j*256+i] = ((bytes[i>>3] | (bytes[(i>>3)+1]<<8)) >> (i&7)) & mask;
 #else
@@ -615,7 +616,9 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
   CHECK_AND_ASSERT_THROW_MES(c <= 9, "c is too large");
 
   ge_p3 result = ge_p3_identity;
+  bool result_init = false;
   std::unique_ptr<ge_p3[]> buckets{new ge_p3[1<<c]};
+  bool buckets_init[1<<9];
   std::shared_ptr<pippenger_cached_data> local_cache = cache == NULL ? pippenger_init_cache(data) : cache;
   std::shared_ptr<pippenger_cached_data> local_cache_2 = data.size() > cache_size ? pippenger_init_cache(data, cache_size) : NULL;
 
@@ -632,7 +635,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
 
   for (size_t k = groups; k-- > 0; )
   {
-    if (!ge_p3_is_point_at_infinity(&result))
+    if (result_init)
     {
       ge_p2 p2;
       ge_p3_to_p2(&p2, &result);
@@ -646,8 +649,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
           ge_p1p1_to_p2(&p2, &p1);
       }
     }
-    for (size_t i = 0; i < (1u<<c); ++i)
-      buckets[i] = ge_p3_identity;
+    memset(buckets_init, 0, 1u<<c);
 
     // partition scalars into buckets
     for (size_t i = 0; i < data.size(); ++i)
@@ -659,7 +661,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
       if (bucket == 0)
         continue;
       CHECK_AND_ASSERT_THROW_MES(bucket < (1u<<c), "bucket overflow");
-      if (!ge_p3_is_point_at_infinity(&buckets[bucket]))
+      if (buckets_init[bucket])
       {
         if (i < cache_size)
           add(buckets[bucket], local_cache->cached[i]);
@@ -667,17 +669,37 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<
           add(buckets[bucket], local_cache_2->cached[i - cache_size]);
       }
       else
+      {
         buckets[bucket] = data[i].point;
+        buckets_init[bucket] = true;
+      }
     }
 
     // sum the buckets
-    ge_p3 pail = ge_p3_identity;
+    ge_p3 pail;
+    bool pail_init = false;
     for (size_t i = (1<<c)-1; i > 0; --i)
     {
-      if (!ge_p3_is_point_at_infinity(&buckets[i]))
-        add(pail, buckets[i]);
-      if (!ge_p3_is_point_at_infinity(&pail))
-        add(result, pail);
+      if (buckets_init[i])
+      {
+        if (pail_init)
+          add(pail, buckets[i]);
+        else
+        {
+          pail = buckets[i];
+          pail_init = true;
+        }
+      }
+      if (pail_init)
+      {
+        if (result_init)
+          add(result, pail);
+        else
+        {
+          result = pail;
+          result_init = true;
+        }
+      }
     }
   }