#!/usr/bin/env python3

# Parameter sets. Every entry is unpacked further down as
# (bits, primes, batchsize, batchbound):
#   bits       - identifier used in the generated file names and compared
#                against the BITS macro in primes.h
#   primes     - the small primes; the field prime is later computed as
#                p = 4*product(primes) - 1
#   batchsize  - number of consecutive primes in each batch
#   batchbound - per-batch bound, paired element-wise with batchsize
data = (
  (511, # same CSIDH-512 prime but only 2^220 keys
   (3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,587),
   (2,3,4,4,5,5,5,5,5,7,7,8,7,6,1),
   (6,9,11,11,12,12,12,12,12,12,12,12,8,6,1),
  ),
  (512,
   (3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,587),
   (2,3,4,4,5,5,6,7,7,8,8,6,8,1),
   (10,14,16,17,17,17,18,18,18,18,18,13,13,1),
  ),
  (1024,
   (3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,983),
   (2,3,5,4,6,6,6,6,6,7,7,7,6,7,7,5,6,5,10,3,10,5,1),
   (2,4,5,5,6,6,6,6,6,6,6,6,6,6,6,5,5,3,6,2,6,2,0)
  ),
  (2048,
   (3,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,1429,1433,1439,1447,1451,1453,1459,3413),
   (9,10,8,8,7,10,12,11,10,15,10,9,8,6,10,13,10,9,12,13,10,10,10,1),
   (1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,0)
  )
)

import os
import math
import re
import sys

import chain
import costs
import distmults
import sim

# Raise the interpreter's recursion limit to 10000 frames.
# NOTE(review): it is not evident from this file which recursive helper needs
# the extra depth (dac_search and includes recurse here; the imported modules
# may as well) - confirm before lowering it.
sys.setrecursionlimit(10000)

def maybeupdate(fn):
  """Install the freshly generated fn+'.tmp' as fn, unless nothing changed.

  If fn already exists and its text equals that of fn+'.tmp', the temporary
  file is deleted and fn is left untouched. Otherwise the temporary file is
  moved over fn.

  fn: path of the final file; the caller must already have written
      fn+'.tmp'.
  """
  tmp = fn+'.tmp'
  if os.access(fn,os.F_OK):
    with open(fn) as f:
      old = f.read()
    with open(tmp) as f:
      new = f.read()
    if old == new:
      os.unlink(tmp)
      return
  # os.replace overwrites an existing destination on every platform, whereas
  # os.rename raises on Windows when fn already exists.
  os.replace(tmp,fn)

def batchkeys(x,y):
  """Return the coefficient of t^x in (1+t)^x * (1+2t)^y.

  The polynomial is built up one linear factor at a time as a list of
  coefficients (index = power of t). The value equals
  sum over k of C(x,k)*C(y,k)*2^k.

  x: number of (1+t) factors, and the power of t whose coefficient is returned
  y: number of (1+2t) factors
  """
  coeffs = [1]
  for scale,count in ((1,x),(2,y)):
    for _ in range(count):
      # multiply by (1 + scale*t): new[j] = old[j] + scale*old[j-1]
      coeffs = [here+scale*below for here,below in zip(coeffs+[0],[0]+coeffs)]
  return coeffs[x]

def dac_search(target,r0,r1,r2,chain,chainlen,best,bestlen):
  """Depth-first search for a shorter encoding that reaches target.

  (r0,r1,r2) is the current state; one step replaces it by either
  (r0,r2,r0+r2), recorded as bit 1, or (r1,r2,r1+r2), recorded as bit 0.
  Each step's bit is appended at the low end of chain, and chainlen counts
  the steps taken so far.

  best,bestlen is the best encoding known so far; only encodings strictly
  shorter than bestlen are accepted. Returns the (possibly improved)
  pair best,bestlen.
  """
  stepsleft = bestlen-1-chainlen
  # prune: no room for a shorter chain, overshoot, or target out of reach
  # even if r2 doubled on every remaining step
  if stepsleft < 0 or r2 > target or (r2<<stepsleft) < target:
    return best,bestlen
  if r2 == target:
    return chain,chainlen
  for bit,older in ((1,r0),(0,r1)):
    best,bestlen = dac_search(target,older,r2,older+r2,2*chain+bit,chainlen+1,best,bestlen)
  return best,bestlen

def dac(target):
  """Return (chain,chainlen), a shortest encoding found by dac_search.

  The search starts from the state (1,2,3) and is repeated with a length
  limit that grows by one until dac_search reports a hit.

  target: integer >= 3. The start state already has r2 == 3, so a smaller
          target can never be reached and the loop would never terminate.

  Raises ValueError if target < 3.
  """
  if target < 3:
    raise ValueError('dac: target must be >= 3, got %r' % (target,))
  best = None
  bestlen = 0
  while best is None:
    bestlen += 1
    best,bestlen = dac_search(target,1,2,3,0,0,best,bestlen)
  return best,bestlen

def _emit_array(f,name,dim,values,bounded=False):
  # Write one `const long long name[dim] = { v, v, ... };` definition to f.
  # With bounded=True each value is asserted to be below 2**63 right before
  # it is written.
  f.write('const long long %s[%s] = {\n' % (name,dim))
  f.write(' ')
  for v in values:
    if bounded: assert v < 2**63
    f.write(' %d,' % v)
  f.write('\n')
  f.write('};\n')

# Generate primes<bits>.c for every parameter set.
for bits,primes,batchsize,batchbound in data:
  fn = 'primes%d.c'%bits
  with open(fn+'.tmp','w') as f:
    f.write('// DO NOT EDIT! generated by ./autogen\n\n')
    f.write('#include "primes.h"\n')
    f.write('\n')

    # key count: product of the per-batch counts
    keys = 1
    for s,b in zip(batchsize,batchbound):
      keys *= batchkeys(s,b)
    f.write('// number of keys: %d\n' % keys)
    f.write('// approximately 2^%f\n' % math.log(keys,2))
    f.write('\n')
    # XXX: As a matter of reproducibility, we do not generate arbitrary timing
    # data and put it in a C file:
    #f.write('// average costs (calculated):\n')
    #x = distmults.average(primes,batchsize,batchbound)
    #f.write(costs.strstats(x,'// ','%.6f',primes,batchsize).strip()+'\n')
    #f.write('\n')

    #trials = 4096
    #f.write('// average costs (%d simulated trials):\n' % trials)
    #y = {}
    #for trial in range(trials):
    #  x = sim.trial(primes,batchsize,batchbound)
    #  for cost in x:
    #    if cost not in y: y[cost] = 0
    #    y[cost] += x[cost]
    #for cost in y: y[cost] *= 1.0/trials
    #f.write(costs.strstats(y,'// ','%.6f',primes,batchsize).strip()+'\n')
    #f.write('\n')

    _emit_array(f,'primes','primes_num',primes,bounded=True)
    f.write('\n')

    # per-prime (chain,chainlen) pairs from dac()
    primesdac = [dac(l) for l in primes]
    _emit_array(f,'primes_dac','primes_num',[D[0] for D in primesdac],bounded=True)
    f.write('\n')
    _emit_array(f,'primes_daclen','primes_num',[D[1] for D in primesdac],bounded=True)
    f.write('\n')
    _emit_array(f,'primes_batchsize','primes_batches',batchsize)
    f.write('\n')

    # batch k covers primes[starts[k]:stops[k]]
    stops = []
    pos = 0
    for s in batchsize:
      pos += s
      stops.append(pos)
    starts = [stop-s for stop,s in zip(stops,batchsize)]
    _emit_array(f,'primes_batchstart','primes_batches',starts)
    f.write('\n')
    _emit_array(f,'primes_batchstop','primes_batches',stops)
    f.write('\n')
    _emit_array(f,'primes_batchmaxdaclen','primes_batches',
      [max(D[1] for D in primesdac[lo:hi]) for lo,hi in zip(starts,stops)])
    f.write('\n')
    _emit_array(f,'primes_batchbound','primes_batches',batchbound)
  maybeupdate(fn)

# Generate primes.h: size macros selected by BITS, followed by extern
# declarations of the arrays written to the primes<bits>.c files.
fn = 'primes.h'
with open(fn+'.tmp','w') as f:
  f.write(
    '// DO NOT EDIT! generated by ./autogen\n\n'
    '#ifndef primes_h\n'
    '#define primes_h\n'
    '\n'
    '#include "primes_namespace.h"\n'
    '\n'
  )

  # one #if / #elif branch per parameter set
  hashif = '#if'
  for bits,primes,batchsize,batchbound in data:
    f.write('%s BITS == %d\n' % (hashif,bits))
    f.write(
      '#define primes_num %d\n'
      '#define primes_batches %d\n'
      % (len(primes),len(batchsize))
    )
    m = max(b+s for b,s in zip(batchbound,batchsize))
    f.write('#define primes_maxbatchboundplussize %d\n' % m)
    hashif = '#elif'

  # any other BITS value is a compile-time error listing the valid choices
  f.write('#else\n')
  orbits = ' or '.join('%d' % entry[0] for entry in data)
  f.write('#error BITS must be %s\n' % orbits)
  f.write('#endif\n')
  f.write('\n')
  f.writelines(
    'extern const long long %s[%s];\n' % decl for decl in (
      ('primes','primes_num'),
      ('primes_dac','primes_num'),
      ('primes_daclen','primes_num'),
      ('primes_batchsize','primes_batches'),
      ('primes_batchstart','primes_batches'),
      ('primes_batchstop','primes_batches'),
      ('primes_batchbound','primes_batches'),
      ('primes_batchmaxdaclen','primes_batches'),
    )
  )
  f.write('\n')
  f.write('#endif\n')
maybeupdate(fn)

def writeconst(f,n,limbs):
  """Write the non-negative integer n to f as `.quad` assembler directives.

  n is split into 64-bit words, least significant first, and zero-padded to
  exactly `limbs` words; the assertion fails if n needs more than that. Four
  words go on each output line. Words below 10 are printed in decimal, all
  others as 16-digit hexadecimal.
  """
  assert n >= 0

  mask = (1<<64)-1
  words = []
  while n:
    words.append(n & mask)
    n >>= 64

  words.extend([0]*(limbs-len(words)))
  assert len(words) == limbs

  for start in range(0,len(words),4):
    row = words[start:start+4]
    f.write('    .quad %s\n' % ', '.join(('%d' if w<10 else '0x%016x') % w for w in row))

# For every parameter set: derive the prime p and the constants shared by
# all field-arithmetic files generated in the rest of this loop body.
for bits,primes,batchsize,batchbound in data:
  # p = 4*l_1*...*l_n - 1
  p = 1
  for l in primes: p *= l
  p = 4*p-1

  # sizes of p in bits, bytes and 64-bit limbs
  pbits = p.bit_length()
  pbytes = (pbits+7)//8
  plimbs = (pbytes+7)//8
  pmontbits = 64*plimbs

  # sqrt16p = floor(sqrt(16*p)), by integer Newton iteration
  sqrt16p = 1
  while not (sqrt16p**2 <= 16*p < (sqrt16p+1)**2):
    sqrt16p = (sqrt16p+(16*p)//sqrt16p)//2

  # written out below with the comment "-p^-1 mod 2^64"
  inv = 2**64-pow(p,2**62-1,2**64)

  # chain for the exponent p-2, used to emit fp_inv
  invchain = chain.chain2(p-2)
  invchaincost = chain.cost2(invchain)

  # fp_inv<bits>.c: the body of fp_inv is the code generated for invchain
  fn = 'fp_inv%s.c'%bits
  with open(fn+'.tmp','w') as f:
    f.write(''.join((
      '// DO NOT EDIT! generated by ./autogen\n\n',
      '#include "fp.h"\n',
      '\n',
      '// %s mults, %s squarings\n' % invchaincost,
      'void fp_inv(fp *x)\n',
      '{\n',
      chain.code(invchain),
      '}\n',
    )))
  maybeupdate(fn)

  # chain for the exponent (p+1)/4, used to emit fp_sqrt
  sqrtchain = chain.chain2((p+1)//4)
  sqrtchaincost = chain.cost2(sqrtchain)

  # fp_sqrt<bits>.c: run the chain, then square the result and compare it
  # with the saved input; that final fp_sq2 is the extra squaring counted
  # in the cost comment.
  fn = 'fp_sqrt%s.c'%bits
  with open(fn+'.tmp','w') as f:
    f.write(''.join((
      '// DO NOT EDIT! generated by ./autogen\n\n',
      '#include "fp.h"\n',
      '\n',
      '// %s mults, %s squarings\n' %
        (sqrtchaincost[0],sqrtchaincost[1]+1),
      'long long fp_sqrt(fp *x)\n',
      '{\n',
      '  fp origx = *x;\n',
      chain.code(sqrtchain),
      '  fp check; fp_sq2(&check,x);\n',
      '  return fp_isequal(&check,&origx);\n',
      '}\n',
    )))
  maybeupdate(fn)

  # uintbig_const_le_<bits>.s: the global constants 1, p and
  # floor(sqrt(16*p)), each stored as plimbs little-endian 64-bit words.
  fn = 'uintbig_const_le_%s.s' % bits
  with open(fn+'.tmp','w') as f:
    f.write('/* DO NOT EDIT! generated by ./autogen */\n')
    f.write('\n')

    consts = (('1',1),('p',p),('four_sqrt_p',sqrt16p))
    for idx,(suffix,value) in enumerate(consts):
      if idx:
        # blank line between consecutive definitions
        f.write('\n')
      symbol = 'highctidh_%s_uintbig_%s' % (bits,suffix)
      f.write('.global %s\n' % symbol)
      f.write('%s:\n' % symbol)
      writeconst(f,value,plimbs)
      f.write('    .size %s, %d\n' % (symbol, pbytes))
      f.write('    .type %s, @object\n' % symbol)
  maybeupdate(fn)

  # uintbig<bits>_<arch>.S: multi-limb integer helpers in Intel-syntax
  # assembly. The same text is written once per architecture name; the whole
  # file is wrapped in `#if HIGHCTIDH_PORTABLE == 0`.
  # The emitted routines take their arguments in rdi, rsi, rdx.
  for _arch in ["x86_64", "i86pc"]:
      fn = f"uintbig{bits}_{_arch}.S"
      with open(fn+'.tmp','w') as f:
        f.write('/* DO NOT EDIT! generated by ./autogen */\n')
        f.write('#if HIGHCTIDH_PORTABLE == 0\n\n')
        f.write('.intel_syntax noprefix\n')
        f.write('\n')
        f.write('//#include "uintbig_namespace.h"\n')
        f.write('\n')
        f.write('.section .rodata\n')
        f.write('\n')
        # pulls in the constants file generated just above
        f.write('#include "uintbig_const_le_%s.s"\n' % bits)
        f.write('\n')
        f.write('.section .text\n')
        f.write('\n')
        # uintbig_set: store rsi into limb 0 at [rdi], then zero the
        # remaining plimbs-1 limbs with rep stosq
        symbol = 'highctidh_%s_uintbig_set' % bits
        f.write('.global %s\n' % symbol)
        f.write('%s:\n' % symbol)
        f.write('    cld\n')
        f.write('    mov rax, rsi\n')
        f.write('    stosq\n')
        f.write('    xor rax, rax\n')
        f.write('    mov rcx, %d\n' % (plimbs-1))
        f.write('    rep stosq\n')
        f.write('    ret\n')
        f.write('\n')
        f.write('\n')
        # uintbig_bit: return bit number rsi of the integer at [rdi]
        # (limb index rsi>>6, bit position rsi&0x3f)
        symbol = 'highctidh_%s_uintbig_bit' % bits
        f.write('.global %s\n' % symbol)
        f.write('%s:\n' % symbol)
        f.write('    mov rcx, rsi\n')
        f.write('    and rcx, 0x3f\n')
        f.write('    shr rsi, 6\n')
        f.write('    mov rax, [rdi + 8*rsi]\n')
        f.write('    shr rax, cl\n')
        f.write('    and rax, 1\n')
        f.write('    ret\n')
        f.write('\n')
        f.write('\n')
        # uintbig_add3: [rdi] = [rsi] + [rdx] limb by limb (add, then adc
        # for the other plimbs-1 limbs); the final carry is returned in rax
        symbol = 'highctidh_%s_uintbig_add3' % bits
        f.write('.global %s\n' % symbol)
        f.write('%s:\n' % symbol)
        f.write('    mov rax, [rsi +  0]\n')
        f.write('    add rax, [rdx +  0]\n')
        f.write('    mov [rdi +  0], rax\n')
        f.write('    .set k, 1\n')
        f.write('    .rept %d\n' % (plimbs-1))
        f.write('        mov rax, [rsi + 8*k]\n')
        f.write('        adc rax, [rdx + 8*k]\n')
        f.write('        mov [rdi + 8*k], rax\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('    setc al\n')
        f.write('    movzx rax, al\n')
        f.write('    ret\n')
        f.write('\n')
        # uintbig_sub3: [rdi] = [rsi] - [rdx] limb by limb (sub, then sbb);
        # the final borrow is returned in rax
        symbol = 'highctidh_%s_uintbig_sub3' % bits
        f.write('.global %s\n' % symbol)
        f.write('%s:\n' % symbol)
        f.write('    mov rax, [rsi +  0]\n')
        f.write('    sub rax, [rdx +  0]\n')
        f.write('    mov [rdi +  0], rax\n')
        f.write('    .set k, 1\n')
        f.write('    .rept %d\n' % (plimbs-1))
        f.write('        mov rax, [rsi + 8*k]\n')
        f.write('        sbb rax, [rdx + 8*k]\n')
        f.write('        mov [rdi + 8*k], rax\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('    setc al\n')
        f.write('    movzx rax, al\n')
        f.write('    ret\n')
        f.write('\n')
        f.write('\n')
        # uintbig_mul3_64: [rdi] = low plimbs limbs of [rsi] * rdx.
        # mulx multiplies by rdx implicitly; r10 and r11 alternately hold
        # the high half of the previous limb product, which is added into
        # the next limb. The high half of the last product is not stored.
        symbol = 'highctidh_%s_uintbig_mul3_64' % bits
        f.write('.global %s\n' % symbol)
        f.write('%s:\n' % symbol)
        f.write('\n')
        f.write('    mulx r10, rax, [rsi +  0]\n')
        f.write('    mov [rdi +  0], rax\n')
        f.write('\n')
        for i in range(1,plimbs):
          if i&1:
            f.write('    mulx r11, rax, [rsi + %d]\n' % (8*i))
            if i == 1:
              # first addition starts the carry chain, so plain add
              f.write('    add  rax, r10\n')
            else:
              f.write('    adcx rax, r10\n')
            f.write('    mov [rdi + %d], rax\n' % (8*i))
          else:
            f.write('    mulx r10, rax, [rsi + %d]\n' % (8*i))
            f.write('    adcx rax, r11\n')
            f.write('    mov [rdi + %d], rax\n' % (8*i))
          f.write('\n')
        f.write('    ret\n\n')
        f.write('#endif\n')
      maybeupdate(fn)

  # fp_const_le_<bits>.s: field constants as little-endian 64-bit words.
  fn = 'fp_const_le_%s.s' % bits
  prefix = 'highctidh_%s_' % bits
  with open(fn+'.tmp','w') as f:
    f.write('/* DO NOT EDIT! generated by ./autogen */\n')
    f.write('\n')

    # hidden single-word constant inv
    sym = '.'+prefix+'inv_min_p_mod_r'
    f.write('.hidden %s\n' % sym)
    f.write('%s: /* -p^-1 mod 2^64 */\n' % sym)
    writeconst(f,inv,1)
    f.write('\n')

    # fp_0: pbytes zero bytes
    sym = prefix+'fp_0'
    f.write('.global %s\n' % sym)
    f.write('%s:\n' % sym)
    f.write('    .zero %d\n' % pbytes)
    f.write('    .size %s, %d\n' % (sym, pbytes))
    f.write('    .type %s, @object\n' % sym)
    f.write('\n')

    # fp_1 and fp_2: 2^pmontbits mod p and 2^(pmontbits+1) mod p
    for sym,shift in ((prefix+'fp_1',pmontbits),(prefix+'fp_2',pmontbits+1)):
      f.write('.global %s\n' % sym)
      f.write('%s: /* 2^%d mod p */\n' % (sym, shift))
      writeconst(f,(1<<shift)%p,plimbs)
      f.write('    .size %s, %d\n' % (sym, pbytes))
      f.write('    .type %s, @object\n' % sym)
      f.write('\n')

    # hidden constant (2^pmontbits)^2 mod p
    sym = '.'+prefix+'r_squared_mod_p'
    f.write('.hidden %s\n' % sym)
    f.write('%s: /* (2^%d)^2 mod p */\n' % (sym, pmontbits))
    writeconst(f,(1<<(2*pmontbits))%p,plimbs)
  maybeupdate(fn)

  # fp<bits>_<arch>.S: field arithmetic in Intel-syntax assembly. The same
  # text is written once per architecture name; the whole file is wrapped in
  # `#if HIGHCTIDH_PORTABLE == 0`.
  for _arch in ["x86_64", "i86pc"]:
      fn = f"fp{bits}_{_arch}.S"
      prefix = 'highctidh_%s_' % bits
      with open(fn+'.tmp','w') as f:
        f.write('/* DO NOT EDIT! generated by ./autogen */\n')
        f.write('#if HIGHCTIDH_PORTABLE == 0\n\n')
        f.write('.intel_syntax noprefix\n')
        f.write('\n')
        f.write('//#include "uintbig_namespace.h"\n')
        f.write('//#include "fp_namespace.h"\n')
        f.write('\n')
        f.write('.section .rodata\n')
        f.write('\n')
        # assembler-level symbols used by the .rept blocks and stack
        # adjustments emitted further down
        f.write('.set pbits,%d\n' % pbits)
        f.write('.set pbytes,%d\n' % pbytes)
        f.write('.set plimbs,%d\n' % plimbs)
        f.write('\n')
        # hidden copy of p, addressed rip-relative by the routines below
        f.write('.hidden .uintbig_p_local\n')
        f.write('.uintbig_p_local:\n')
        writeconst(f,p,plimbs)
        f.write('\n')

        # pulls in the constants file generated just above
        f.write('#include "fp_const_le_%s.s"\n' % bits)
        f.write('\n')
        # f.write('.global p_minus_2\n')
        # f.write('p_minus_2:\n')
        # writeconst(f,p-2,plimbs)
        # f.write('\n')

        # f.write('.global p_minus_1_halves\n')
        # f.write('p_minus_1_halves:\n')
        # writeconst(f,(p-1)//2,plimbs)
        # f.write('\n')

        f.write('.section .text\n')
        f.write('.p2align 4,,15\n')
        f.write('\n')
        # fp_copy: copy plimbs quadwords from [rsi] to [rdi] (rep movsq)
        f.write('.global %sfp_copy\n' % prefix)
        f.write('%sfp_copy:\n' % prefix)
        f.write('    cld\n')
        f.write('    mov rcx, plimbs\n')
        f.write('    rep movsq\n')
        f.write('    ret\n')
        f.write('\n')
        # fp_cmov: branch-free conditional move. rax = -(dl) is used as a
        # mask; per limb, [rdi] ^= ([rdi]^[rsi]) & mask, so [rdi] takes the
        # value at [rsi] when the mask is all ones and is unchanged when it
        # is zero.
        f.write('.global %sfp_cmov\n' % prefix)
        f.write('%sfp_cmov:\n' % prefix)
        f.write('    movzx rax, dl\n')
        f.write('    neg rax\n')
        f.write('    .set k, 0\n')
        f.write('    .rept plimbs\n')
        f.write('        mov rcx, [rdi + 8*k]\n')
        f.write('        mov rdx, [rsi + 8*k]\n')
        f.write('\n')
        f.write('        xor rdx, rcx\n')
        f.write('        and rdx, rax\n')
        f.write('        xor rcx, rdx\n')
        f.write('\n')
        f.write('        mov [rdi + 8*k], rcx\n')
        f.write('\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('    ret\n')
        f.write('\n')
        # fp_cswap: branch-free conditional swap with the same mask; the
        # masked difference r8 is xored into both limbs, so [rdi] and [rsi]
        # are exchanged when the mask is all ones.
        f.write('.global %sfp_cswap\n' % prefix)
        f.write('%sfp_cswap:\n' % prefix)
        f.write('    movzx rax, dl\n')
        f.write('    neg rax\n')
        f.write('    .set k, 0\n')
        f.write('    .rept plimbs\n')
        f.write('        mov rcx, [rdi + 8*k]\n')
        f.write('        mov rdx, [rsi + 8*k]\n')
        f.write('\n')
        f.write('        mov r8, rcx\n')
        f.write('        xor r8, rdx\n')
        f.write('        and r8, rax\n')
        f.write('\n')
        f.write('        xor rcx, r8\n')
        f.write('        xor rdx, r8\n')
        f.write('\n')
        f.write('        mov [rdi + 8*k], rcx\n')
        f.write('        mov [rsi + 8*k], rdx\n')
        f.write('\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('    ret\n')
        f.write('\n')
        # reduce_once (file-local label): compute x - p for the value x at
        # [rdi] and keep the difference only if the subtraction did not
        # borrow. The difference limbs cycle through eight scratch
        # registers; when plimbs > 8 the first plimbs-8 limbs are spilled
        # to a stack area because their registers are reused.
        f.write('.%sreduce_once:\n' % prefix)
        f.write('    push rbp\n')
        if plimbs > 8:
          f.write('    sub rsp, %d\n' % (8*(plimbs-8)))
        f.write('    mov rbp, rdi\n')
        f.write('\n')

        regs = ('rdi','rsi','rdx','rcx','r8','r9','r10','r11')
        for i in range(plimbs):
          regi = regs[i%len(regs)]+', '
          # pad two-character register names so the operands line up
          if len(regi) < 5: regi += ' '
          f.write('    mov %s[rbp + %d]\n' % (regi,i*8))
          if i == 0:
            f.write('    sub %s[rip + .uintbig_p_local + %d]\n' % (regi,i*8))
          else:
            f.write('    sbb %s[rip + .uintbig_p_local + %d]\n' % (regi,i*8))
          if i < plimbs-8:
            # spill: this register is overwritten eight limbs later
            f.write('    mov qword ptr [rsp + %d], %s\n' % (8*i,regs[i%len(regs)]))

        f.write('\n')
        # rax = all ones if the subtraction did not borrow, else zero
        f.write('    setnc al\n')
        f.write('    movzx rax, al\n')
        f.write('    neg rax\n')
        f.write('\n')
        # cswap2 r, m: m ^= (r^m) & rax, i.e. m takes r's value when rax
        # is all ones and is left unchanged when rax is zero
        f.write('.macro cswap2, r, m\n')
        f.write('    xor \\r, \\m\n')
        f.write('    and \\r, rax\n')
        f.write('    xor \\m, \\r\n')
        f.write('.endm\n')
        f.write('\n')

        # limbs whose difference is still held in a register
        # NOTE(review): for plimbs < 8 this range starts at a negative index
        # and would emit negative offsets; presumably every parameter set
        # has plimbs >= 8 - confirm.
        for i in range(plimbs-8,plimbs):
          regi = regs[i%len(regs)]
          f.write('    cswap2 %s, [rbp + %d]\n' % (regi,8*i))

        # limbs whose difference was spilled: reload from the stack first
        for i in range(plimbs-8):
          regi = regs[i%len(regs)]
          f.write('    mov %s, [rsp + %d]\n' % (regs[i%len(regs)],8*i))
          f.write('    cswap2 %s, [rbp + %d]\n' % (regi,8*i))
        f.write('\n')

        if plimbs > 8:
          f.write('    add rsp, %d\n' % (8*(plimbs-8)))
        f.write('    pop rbp\n')
        f.write('    ret\n')
        f.write('\n')
        # fp_add2 sets rdx = rdi and falls through into fp_add3.
        # fp_add3: call uintbig_add3 on the same arguments, then tail-jump
        # to reduce_once with rdi restored to the output pointer.
        f.write('.global %sfp_add2\n' % prefix)
        f.write('%sfp_add2:\n' % prefix)
        f.write('    mov rdx, rdi\n')
        f.write('.global %sfp_add3\n' % prefix)
        f.write('%sfp_add3:\n' % prefix)
        f.write('    push rdi\n')
        f.write('    call %suintbig_add3\n' % prefix)
        f.write('    pop rdi\n')
        f.write('    jmp .%sreduce_once\n' % prefix)
        f.write('\n')
        # fp_sub2 rearranges (rdi, rsi) into (rdi, rsi=rdi, rdx=old rsi)
        # and falls through into fp_sub3.
        f.write('.global %sfp_sub2\n' % prefix)
        f.write('%sfp_sub2:\n' % prefix)
        f.write('  mov rdx, rdi\n')
        f.write('  xchg rsi, rdx\n')
        # fp_sub3: call uintbig_sub3, turn its returned borrow into a mask
        # (neg rax), build p & mask in a pbytes-sized stack buffer and add
        # that buffer to the result at [rdi]; so p is added back exactly
        # when the subtraction borrowed.
        f.write('.global %sfp_sub3\n' % prefix)
        f.write('%sfp_sub3:\n' % prefix)
        f.write('    push rdi\n')
        f.write('    call %suintbig_sub3\n' % prefix)
        f.write('    pop rdi\n')
        f.write('    neg rax\n')
        f.write('\n')
        f.write('    sub rsp, pbytes\n')
        f.write('\n')
        # stack buffer = p & mask
        f.write('    mov rcx, [rip + .uintbig_p_local +  0]\n')
        f.write('    and rcx, rax\n')
        f.write('    mov [rsp + 0],rcx\n')
        f.write('    .set k, 1\n')
        f.write('    .rept plimbs-1\n')
        f.write('        mov rcx, [rip + .uintbig_p_local + 8*k]\n')
        f.write('        and rcx, rax\n')
        f.write('        mov [rsp + 8*k], rcx\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('\n')
        # [rdi] += stack buffer, with carry propagation
        f.write('    mov rcx, [rsp +  0]\n')
        f.write('    add rcx, [rdi +  0]\n')
        f.write('    mov [rdi +  0], rcx\n')
        f.write('    .set k, 1\n')
        f.write('    .rept plimbs-1\n')
        f.write('        mov rcx, [rsp + 8*k]\n')
        f.write('        adc rcx, [rdi + 8*k]\n')
        f.write('        mov [rdi + 8*k], rcx\n')
        f.write('        .set k, k+1\n')
        f.write('    .endr\n')
        f.write('\n')
        f.write('    add rsp, pbytes\n')

        f.write('    ret\n')
        f.write('\n')
        f.write('\n')
        # fp_mul2 sets rdx = rdi and falls through into fp_mul3.
        # fp_mul3 is emitted in one of two forms: for plimbs == 8 the nine
        # accumulator words live in registers (r8..r15, rbp); otherwise they
        # live in a stack buffer of plimbs+1 words. Both forms run one
        # MULSTEP per limb and finish by jumping to reduce_once.
        f.write('/* Montgomery arithmetic */\n')
        f.write('\n')
        f.write('.global %sfp_mul2\n' % prefix)
        f.write('%sfp_mul2:\n' % prefix)
        f.write('  mov rdx, rdi\n')

        f.write('.global %sfp_mul3\n' % prefix)
        f.write('%sfp_mul3:\n' % prefix)
        f.write('    push rbp\n')
        f.write('    push rbx\n')

        if plimbs == 8:
          # register variant: save the remaining registers that are used as
          # accumulators, keep the output pointer on the stack, and move the
          # two operand pointers into rdi/rsi
          f.write('    push r12\n')
          f.write('    push r13\n')
          f.write('    push r14\n')
          f.write('    push r15\n')
          f.write('\n')
          f.write('    push rdi\n')
          f.write('\n')
          f.write('    mov rdi, rsi\n')
          f.write('    mov rsi, rdx\n')
          f.write('\n')
          f.write('    xor r8,  r8\n')
          f.write('    xor r9,  r9\n')
          f.write('    xor r10, r10\n')
          f.write('    xor r11, r11\n')
          f.write('    xor r12, r12\n')
          f.write('    xor r13, r13\n')
          f.write('    xor r14, r14\n')
          f.write('    xor r15, r15\n')
          f.write('    xor rbp, rbp\n')
          f.write('\n')
          f.write('    /* flags are already cleared */\n')
          f.write('\n')
          # MULSTEP k: first derive the multiplier
          #   rdx = ([rsi]*[rdi+8k] + r0) * inv_min_p_mod_r   (low 64 bits)
          # and add rdx*p to the accumulators, then add [rdi+8k]*[rsi..].
          # Each pass uses two interleaved carry chains (adcx and adox).
          f.write('.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8\n')
          f.write('\n')
          f.write('    mov rdx, [rsi +  0]\n')
          f.write('    mulx rcx, rdx, [rdi + 8*\\k]\n')
          f.write('    add rdx, \\r0\n')
          f.write('    mulx rcx, rdx, [rip + .%sinv_min_p_mod_r]\n' % prefix)
          f.write('\n')
          f.write('    xor rax, rax /* clear flags */\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rip + .uintbig_p_local +  0]\n')
          f.write('    adox \\r0, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rip + .uintbig_p_local +  8]\n')
          f.write('    adcx \\r1, rbx\n')
          f.write('    adox \\r1, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rip + .uintbig_p_local + 16]\n')
          f.write('    adcx \\r2, rcx\n')
          f.write('    adox \\r2, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rip + .uintbig_p_local + 24]\n')
          f.write('    adcx \\r3, rbx\n')
          f.write('    adox \\r3, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rip + .uintbig_p_local + 32]\n')
          f.write('    adcx \\r4, rcx\n')
          f.write('    adox \\r4, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rip + .uintbig_p_local + 40]\n')
          f.write('    adcx \\r5, rbx\n')
          f.write('    adox \\r5, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rip + .uintbig_p_local + 48]\n')
          f.write('    adcx \\r6, rcx\n')
          f.write('    adox \\r6, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rip + .uintbig_p_local + 56]\n')
          f.write('    adcx \\r7, rbx\n')
          f.write('    adox \\r7, rax\n')
          f.write('\n')
          f.write('    mov rax, 0\n')
          f.write('    adcx \\r8, rcx\n')
          f.write('    adox \\r8, rax\n')
          f.write('\n')
          f.write('\n')
          f.write('    mov rdx, [rdi + 8*\\k]\n')
          f.write('\n')
          f.write('    xor rax, rax /* clear flags */\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rsi +  0]\n')
          f.write('    adox \\r0, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rsi +  8]\n')
          f.write('    adcx \\r1, rbx\n')
          f.write('    adox \\r1, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rsi + 16]\n')
          f.write('    adcx \\r2, rcx\n')
          f.write('    adox \\r2, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rsi + 24]\n')
          f.write('    adcx \\r3, rbx\n')
          f.write('    adox \\r3, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rsi + 32]\n')
          f.write('    adcx \\r4, rcx\n')
          f.write('    adox \\r4, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rsi + 40]\n')
          f.write('    adcx \\r5, rbx\n')
          f.write('    adox \\r5, rax\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rsi + 48]\n')
          f.write('    adcx \\r6, rcx\n')
          f.write('    adox \\r6, rax\n')
          f.write('\n')
          f.write('    mulx rcx, rax, [rsi + 56]\n')
          f.write('    adcx \\r7, rbx\n')
          f.write('    adox \\r7, rax\n')
          f.write('\n')
          f.write('    mov rax, 0\n')
          f.write('    adcx \\r8, rcx\n')
          f.write('    adox \\r8, rax\n')
          f.write('\n')
          f.write('.endm\n')
          f.write('\n')
          # the register list is rotated by one position per step
          f.write('    MULSTEP 0, r8,  r9,  r10, r11, r12, r13, r14, r15, rbp\n')
          f.write('    MULSTEP 1, r9,  r10, r11, r12, r13, r14, r15, rbp, r8\n')
          f.write('    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8,  r9\n')
          f.write('    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8,  r9,  r10\n')
          f.write('    MULSTEP 4, r12, r13, r14, r15, rbp, r8,  r9,  r10, r11\n')
          f.write('    MULSTEP 5, r13, r14, r15, rbp, r8,  r9,  r10, r11, r12\n')
          f.write('    MULSTEP 6, r14, r15, rbp, r8,  r9,  r10, r11, r12, r13\n')
          f.write('    MULSTEP 7, r15, rbp, r8,  r9,  r10, r11, r12, r13, r14\n')
          f.write('\n')
          f.write('    pop rdi\n')
          f.write('\n')
          # store the eight result limbs, starting with rbp
          f.write('    mov [rdi +  0], rbp\n')
          f.write('    mov [rdi +  8], r8\n')
          f.write('    mov [rdi + 16], r9\n')
          f.write('    mov [rdi + 24], r10\n')
          f.write('    mov [rdi + 32], r11\n')
          f.write('    mov [rdi + 40], r12\n')
          f.write('    mov [rdi + 48], r13\n')
          f.write('    mov [rdi + 56], r14\n')
          f.write('\n')
          f.write('    pop r15\n')
          f.write('    pop r14\n')
          f.write('    pop r13\n')
          f.write('    pop r12\n')
        else:
          # stack variant: reserve 8*plimbs+16 bytes; the first plimbs+1
          # words are the zeroed accumulator and the output pointer is
          # saved at offset 8*plimbs+8
          f.write('\n')
          f.write('  sub rsp,%d\n' % (8*plimbs+16))
          f.write('  mov [rsp+%d],rdi\n' % (8*plimbs+8))
          f.write('  mov rdi,rsi\n')
          f.write('  mov rsi,rdx\n')
          f.write('\n')
          f.write('  /* XXX: put directly into output */\n')
          f.write('  xor rax,rax\n')
          for i in range(plimbs+1):
            f.write('  mov [rsp+%d],rax\n' % (8*i))
          f.write('\n')
          # MULSTEP k, I0..I<plimbs>: same two passes as the register
          # variant, but every accumulator word is loaded into r11 from
          # [rsp+I<i>], updated and stored back
          f.write('.macro MULSTEP, k, %s\n' % ', '.join('I%d' % i for i in range(plimbs+1)))
          f.write('\n')
          f.write('    mov r11,[rsp+\\I0]\n')
          f.write('    mov rdx, [rsi +  0]\n')
          f.write('    mulx rcx, rdx, [rdi + 8*\\k]\n')
          f.write('    add rdx, r11\n')
          f.write('    mulx rcx, rdx, [rip + .%sinv_min_p_mod_r]\n' % prefix)
          f.write('\n')
          f.write('    xor rax, rax /* clear flags */\n')
          f.write('\n')
          f.write('    mulx rbx, rax, [rip + .uintbig_p_local +  0]\n')
          f.write('    adox r11, rax\n')
          f.write('    mov [rsp+\\I0],r11\n')
          f.write('\n')

          # first pass: add rdx*p; rbx and rcx alternate as holders of the
          # previous product's high half
          for i in range(1,plimbs):
            f.write('    mov r11,[rsp+\\I%d]\n' % i)
            if i&1:
              f.write('    mulx rcx, rax, [rip + .uintbig_p_local + %d]\n' % (8*i))
              f.write('    adcx r11, rbx\n')
              f.write('    adox r11, rax\n')
            else:
              f.write('    mulx rbx, rax, [rip + .uintbig_p_local + %d]\n' % (8*i))
              f.write('    adcx r11, rcx\n')
              f.write('    adox r11, rax\n')
            f.write('    mov [rsp+\\I%d],r11\n' % i)
            f.write('\n')

          # top word: add the last high half plus both pending carries
          f.write('    mov r11,[rsp+\\I%d]\n' % plimbs)
          f.write('    mov rax, 0\n')
          if plimbs&1:
            f.write('    adcx r11, rbx\n')
            f.write('    adox r11, rax\n')
          else:
            f.write('    adcx r11, rcx\n')
            f.write('    adox r11, rax\n')
          f.write('    mov [rsp+\\I%d],r11\n' % plimbs)
          f.write('\n')

          # second pass: add [rdi+8k] * [rsi..]
          f.write('    mov rdx, [rdi + 8*\\k]\n')
          f.write('\n')
          f.write('    xor rax, rax /* clear flags */\n')
          f.write('\n')
          f.write('    mov r11,[rsp+\\I0]\n')
          f.write('    mulx rbx, rax, [rsi +  0]\n')
          f.write('    adox r11, rax\n')
          f.write('    mov [rsp+\\I0],r11\n')
          f.write('\n')

          for i in range(1,plimbs):
            f.write('    mov r11,[rsp+\\I%d]\n' % i)
            if i&1:
              f.write('    mulx rcx, rax, [rsi + %d]\n' % (8*i))
              f.write('    adcx r11, rbx\n')
              f.write('    adox r11, rax\n')
            else:
              f.write('    mulx rbx, rax, [rsi + %d]\n' % (8*i))
              f.write('    adcx r11, rcx\n')
              f.write('    adox r11, rax\n')
            f.write('    mov [rsp+\\I%d],r11\n' % i)
            f.write('\n')

          f.write('    mov r11,[rsp+\\I%d]\n' % plimbs)
          f.write('    mov rax, 0\n')
          if plimbs&1:
            f.write('    adcx r11, rbx\n')
            f.write('    adox r11, rax\n')
          else:
            f.write('    adcx r11, rcx\n')
            f.write('    adox r11, rax\n')
          f.write('    mov [rsp+\\I%d],r11\n' % plimbs)
          f.write('\n')

          f.write('.endm\n')
          f.write('\n')

          # byte offsets passed as I0..I<plimbs>: a window over the
          # plimbs+1 stack words that advances by one word per step and
          # wraps modulo 8*(plimbs+1)
          indices = ['8*(k+1)']
          indices += ['(8*(k+%d))%%%d' % (j, 8*(plimbs+1)) for j in range(2,plimbs+1)]
          indices += ['8*k']
          f.write('    .set k, 0\n')
          f.write('    .rept plimbs\n')
          f.write('        MULSTEP k, %s\n' % ', '.join(indices))
          f.write('        .set k, k+1\n')
          f.write('    .endr\n')
          f.write('\n')
          # reload the output pointer and copy stack words 0..plimbs-1 out
          f.write('    mov rdi,[rsp+%d]\n' % (8*plimbs+8))
          f.write('\n')

          for i in range(plimbs):
            f.write('    mov r11,[rsp+%d]\n' % (8*i))
            f.write('    mov [rdi+%d],r11\n' % (8*i))
          f.write('\n')
          f.write('    add rsp,%d\n' % (8*plimbs+16))
          f.write('\n')

        # common epilogue: restore rbx/rbp and tail-jump to reduce_once
        f.write('    pop rbx\n')
        f.write('    pop rbp\n')
        f.write('    jmp .%sreduce_once\n' % prefix)
        f.write('\n')

        # fp_sq1 sets rsi = rdi and falls through into fp_sq2, which sets
        # rdx = rsi and jumps to fp_mul3; squaring is therefore an ordinary
        # multiplication of the operand by itself.
        f.write('.global %sfp_sq1\n' % prefix)
        f.write('%sfp_sq1:\n' % prefix)
        f.write('    mov rsi, rdi\n')
        f.write('.global %sfp_sq2\n' % prefix)
        f.write('%sfp_sq2:\n' % prefix)
        f.write('    /* TODO implement optimized Montgomery squaring */\n')
        f.write('    mov rdx, rsi\n')
        f.write('    jmp %sfp_mul3\n\n' % prefix)
        # closes the `#if HIGHCTIDH_PORTABLE == 0` opened at the top of the file
        f.write('#endif\n')

      maybeupdate(fn)


# Beginnings of make recipe lines: a tab, the compiler variable and the
# namespace macro definitions, ending in backslash-newline so that further
# arguments can follow on the next line.
# NOTE(review): their use lies outside this excerpt; the {0} placeholders in
# ccbits and scc are presumably filled with the bit size via str.format -
# confirm. cctest and ccgeneric are currently identical strings.
cctest = '\t$(CC) -D\'NAMESPACEGENERIC(x)=highctidh_##x\' \\\n'
ccgeneric = '\t$(CC) -D\'NAMESPACEGENERIC(x)=highctidh_##x\' \\\n'
ccbits = '\t$(CC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' -D\'NAMESPACEGENERIC(x)=highctidh_##x\' \\\n'
ccbits += '\t\t$(CFLAGS) \\\n'
scc = '\t$(SCC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' -D\'NAMESPACEGENERIC(x)=highctidh_##x\' \\\n'

def includes(fn):
  """Return fn followed by every file it pulls in through #include "...".

  fn, and every quoted include path found while scanning, is opened exactly
  as written (so relative to the current directory).  Angle-bracket includes
  do not match the pattern and are ignored.  The scan is transitive; each
  path appears once in the result, in the order it is first reached (depth
  first, following the order of the #include lines).

  Every file is read at most once and a path that is already listed is never
  entered again, so headers that include each other (or themselves)
  terminate instead of recursing without bound.  A file that cannot be
  opened still raises the error from open().
  """
  result = []
  seen = set()

  def visit(path):
    seen.add(path)
    result.append(path)
    # Collect the matches first so the file is closed before descending.
    with open(path) as f:
      matches = [re.match(r'\s*#\s*include\s*"([^"]*)"',line) for line in f]
    for m in matches:
      if m is None: continue
      if m.group(1) not in seen:
        visit(m.group(1))

  visit(fn)
  return result

# Field sizes, in bits, of the parameter sets listed in `data` (table order).
databits = [size for size,_primes,_batchsize,_batchbound in data]

# The makefile text is written to a '.tmp' sibling of fn; maybeupdate(fn) is
# called after the with block has closed the file.
fn = 'GNUmakefile'
with open(fn+'.tmp','w') as f:
  # Preamble: host detection, PLATFORM / PLATFORM_SIZE (overridable for
  # cross-building) and the CC_MARCH / CC_MTUNE defaults.
  f.writelines((
    '# DO NOT EDIT! generated by ./autogen\n\n',
    '.PHONY: clean\n',
    'HOST_PLATFORM := $(shell uname -m)\n',
    'HOST_OS := $(shell uname -s)\n\n',
    '### override these for cross-building:\n',
    'PLATFORM ?= $(shell uname -m)\n',
    # NOTE(review): 'sparc4v' in the 64-bit list is emitted as found; the
    # SunOS stanza written later in this file spells the platform 'sun4v' --
    # confirm which is meant.
    'ifeq\t($(PLATFORM), $(filter $(PLATFORM), arm64 aarch64 amd64 i86pc loongarch64 mips64 mips64el ppc64 ppc64le riscv64 s390x sparc sparc4v sparc64 x86_64))\n',
    'PLATFORM_SIZE ?= 64\n',
    'else ifeq ($(PLATFORM), $(filter $(PLATFORM), armv7l i386 i686 mips mipsel))\n',
    'PLATFORM_SIZE ?= 32\n',
    'else\n',
    'PLATFORM_SIZE ?= $(shell getconf LONG_BIT)\n',
    'endif\n\n',
    'ifndef PLATFORM_SIZE\n',
    '\t$(error "Set PLATFORM_SIZE to 32 or 64 and try again")\n',
    'endif\n\n',
    # NOTE(review): the next conditional compares the literal words
    # HOST_PLATFORM and PLATFORM (there is no $(...) around them), so as
    # written the branch is never taken and CC_MARCH stays unset unless the
    # user provides it.  Emitted unchanged; confirm the intent before
    # "fixing" it, since that would start passing -march=native.
    'ifeq\t(HOST_PLATFORM,PLATFORM)\n',
    'CC_MARCH ?= native\n',
    '# Default to "march=native -mtune=native" when not cross-building\n',
    '# https://lemire.me/blog/2018/07/25/it-is-more-complicated-than-i-thought-mtune-march-in-gcc/\n',
    'endif\n',
    'CC_MTUNE ?= $(CC_MARCH)\n\n',
  ))
  # Baseline compiler flags; the platform stanzas written further down append
  # $(BASE_CFLAGS) to CFLAGS.
  f.write('SEC_CFLAGS := -Wformat -Werror=format-security -D_FORTIFY_SOURCE=2 -fstack-protector-strong\n')
  f.write('BASE_CFLAGS := -fpie -fPIC -Wall -Wextra -pedantic -O2 -fwrapv -DGETRANDOM -Werror\n')
  f.write('BASE_CFLAGS+=-DPLATFORM=${PLATFORM} -DPLATFORM_SIZE=${PLATFORM_SIZE}\n')
  f.write('BASE_CFLAGS+=$(SEC_CFLAGS)\n')
  # -march/-mtune are only emitted when CC_MARCH is non-empty.
  f.write('BASE_CFLAGS+=$(if $(strip $(CC_MARCH)),-march=$(CC_MARCH) -mtune=$(CC_MTUNE),)\n')
  # $(filter ...) takes a whitespace-separated word list.  With commas after
  # the words, the words become 'OpenBSD,' and 'DragonFly,' and can never
  # equal $(HOST_OS), so only Darwin used to match; the list is therefore
  # written without commas.
  f.write('ifeq    ($(HOST_OS), $(filter $(HOST_OS), OpenBSD DragonFly Darwin))\n')
  f.write('BASE_CFLAGS+=-D__$(HOST_OS)__\n')
  f.write('endif\n')
  # Linker flags (extra hardening on Linux) and per-compiler adjustments.
  f.writelines((
    'LDFLAGS :=\n',
    'ifeq ($(HOST_OS),Linux)\n',
    'LDFLAGS+=-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now\n',
    'LDFLAGS+=-Wl,-Bsymbolic-functions\n',
    'LDFLAGS+=-Wl,--reduce-memory-overheads -Wl,--no-keep-memory\n',
    'endif\n',
    'ifeq ($(CC),clang)\n',
    'BASE_CFLAGS+=-Wno-unused-command-line-argument\n',
    'endif\n',
    'ifeq ($(CC),gcc)\n',
    'BASE_CFLAGS+=\n',
    'endif\n',
    'ifeq ($(CC),emcc)\n',
    'BASE_CFLAGS+=-Wno-unused-command-line-argument\n',
    'LDFLAGS=-s SIDE_MODULE=1\n',
    'endif\n',
  ))
  # WINDOWS=1 reassigns BASE_CFLAGS with ':=' (discarding what was
  # accumulated above) and then sets CFLAGS / LDFLAGS per compiler.
  f.writelines((
    'ifeq ($(WINDOWS),1)\n',
    'BASE_CFLAGS :=-Wall -Wextra -pedantic -O2 -fwrapv -D__Windows__\n',
    'HIGHCTIDH_PORTABLE ?=1\n',
    'BASE_CFLAGS+=-DPLATFORM=${PLATFORM} -DPLATFORM_SIZE=${PLATFORM_SIZE}\n',
    'ifeq ($(CC),x86_64-w64-mingw32-gcc)\n',
    'BASE_CFLAGS+=-fpie -fPIC\n',
    'CFLAGS = $(BASE_CFLAGS)\n',
    'endif\n',
    'ifeq ($(CC),clang)\n',
    'HIGHCTIDH_PORTABLE ?=1\n',
    'BASE_CFLAGS+=-DPLATFORM=${PLATFORM} -DPLATFORM_SIZE=${PLATFORM_SIZE}\n',
    'CFLAGS = $(BASE_CFLAGS) -Wno-unused-command-line-argument\n',
    'LDFLAGS = -LAdvapi32.dll\n',
    'endif\n',
    'endif\n',
  ))
  # Backend selection.  HIGHCTIDH_PORTABLE=1 (the default) builds the fiat
  # objects; otherwise the per-platform assembly objects are used and
  # ASM_PLATFORM names the assembly flavour.  The libhighctidh_<bits>_OBJS
  # variables are generated from databits, like every other per-size rule in
  # this makefile, so a size added to `data` gets its object list as well.
  f.write('# Default to using fiat crypto, safe and portable but slow backend\n')
  f.write('HIGHCTIDH_PORTABLE ?= 1\n')
  f.write('ifeq\t($(HIGHCTIDH_PORTABLE),1)\n')
  f.write('BASE_CFLAGS += -DHIGHCTIDH_PORTABLE=1\n')
  for bits in databits:
    f.write('\tlibhighctidh_{0}_OBJS := fp2fiat{0}.o fiat_p{0}.o\n'.format(bits))
  f.write('else\n')
  f.write('BASE_CFLAGS += -DHIGHCTIDH_PORTABLE=0\n')
  # Map alternative machine names onto the names used by the .S files.
  f.write('ifeq ($(PLATFORM),amd64)\n')
  f.write('ASM_PLATFORM := x86_64\n')
  f.write('else ifeq ($(PLATFORM),arm64)\n')
  f.write('ASM_PLATFORM := aarch64\n')
  f.write('else\n')
  f.write('ASM_PLATFORM := $(PLATFORM)\n')
  f.write('endif\n')
  f.write('ifeq ($(ASM_PLATFORM), aarch64)\n')
  f.write('ASFLAGS := -march=armv8-a+sve2\n')
  f.write('endif\n')
  for bits in databits:
    f.write('\tlibhighctidh_{0}_OBJS := uintbig{0}.o fp{0}.o\n'.format(bits))
  f.write('endif\n')
  # One stanza per PLATFORM value.  Each appends $(BASE_CFLAGS), plus a
  # platform define where one is listed, to CFLAGS; some add a clang-only
  # extra.  Every element below is one complete stanza.
  f.write('\n')
  f.writelines((
    'ifeq\t($(PLATFORM), $(filter $(PLATFORM), arm64 aarch64))\n'
    '\tCFLAGS+= $(BASE_CFLAGS)\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),armv7l)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__ARM32__\n'
    '\tifeq ($(CC),clang)\n'
    '\t\tCFLAGS+= -fforce-enable-int128\n'
    '\tendif\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),loongarch64)\n'
    '\tCFLAGS+= $(BASE_CFLAGS)\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),mips)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__MIPS__\n'
    '\tifeq ($(CC),clang)\n'
    '\t\tCFLAGS+= -fforce-enable-int128\n'
    '\tendif\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),mips64)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__MIPS64__\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),mips64el)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__MIPS64el__\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),ppc64le)\n'
    '\tCFLAGS+= $(BASE_CFLAGS)\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),ppc64)\n'
    '\tCFLAGS+= $(BASE_CFLAGS)\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),riscv64)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__riscv\n'
    'endif\n\n',

    'ifeq\t($(PLATFORM),s390x)\n'
    '\tCFLAGS+= $(BASE_CFLAGS) -D__s390x__\n'
    '\tifeq ($(CC),clang)\n'
    '\t\tCFLAGS+= -march=z10 -mtune=z10\n'
    '\tendif\n'
    'endif\n\n',
  ))
  # SunOS hosts: common flags first, then one nested stanza each for i86pc,
  # i386 and the sparc family.
  f.writelines((
    'ifeq   ($(HOST_OS), $(filter $(HOST_OS), SunOS))\n',
    'ifeq   ($(PLATFORM), $(filter $(PLATFORM), sparc sun4u sun4v sparc64 i86pc i386))\n',
    '\tBASE_CFLAGS+=$(SEC_CFLAGS)\n',
    '\tCFLAGS+=$(BASE_CFLAGS) -m64 -D__sun\n',
    '\tLDFLAGS+=$(LDFLAGS)\n',
    'ifeq   ($(PLATFORM), $(filter $(PLATFORM), i86pc))\n',
    '\tCFLAGS+=-D__i86pc__\n',
    '\tifeq ($(CC),gcc)\n',
    '\t\tCFLAGS+=-mimpure-text -Wno-attributes\n',
    '\tendif\n',
    'endif\n',
    'ifeq   ($(PLATFORM), $(filter $(PLATFORM), i386))\n',
    '\tCFLAGS+=-D__i386__\n',
    '\tifeq ($(CC),gcc)\n',
    '\t\tCFLAGS+=-mimpure-text\n',
    '\tendif\n',
    'endif\n',
    'ifeq   ($(PLATFORM), $(filter $(PLATFORM), sparc sun4u sun4v sparc64))\n',
    '\tCFLAGS+=-DHIGHCTIDH_PORTABLE=1 -D__sun4v__\n',
    '\tifeq ($(CC),gcc)\n',
    '\t\tCFLAGS+=-mcpu=native -mtune=native -D_REENTRANT\n',
    '\tendif\n',
    'endif\n',
    'endif\n',
    'endif\n',
  ))
  # PLATFORM=unknown, then x86 on every host except SunOS (32- or 64-bit
  # according to PLATFORM_SIZE).
  f.writelines((
    'ifeq\t($(PLATFORM),unknown)\n',
    '\tCFLAGS+= $(BASE_CFLAGS)\n',
    'endif\n\n',
    'ifneq\t($(HOST_OS), $(filter $(HOST_OS), SunOS))\n',
    'ifeq\t($(PLATFORM), $(filter $(PLATFORM), amd64 x86_64 i386 i686))\n',
    'ifeq\t($(PLATFORM_SIZE),32)\n',
    '\tCFLAGS+= $(BASE_CFLAGS) -D__i386__\n',
    '\tifeq ($(CC),clang)\n',
    '\t\tCFLAGS+= -fforce-enable-int128\n',
    '\tendif\n',
    'else\n\n',
    '\tCFLAGS+= $(BASE_CFLAGS) -D__x86_64__\n\n',
    'endif\n',
    'endif\n',
    'endif\n\n',
  ))
  # From here on $(CC) and $(SCC) carry CFLAGS and LDFLAGS themselves.
  f.writelines((
    'SCC ?= $(CC) $(CFLAGS) $(LDFLAGS)\n',
    'CC += $(CFLAGS) $(LDFLAGS)\n',
    '\n',
  ))
  # Entry-point targets: default, all, generic, one aggregate target per
  # field size, and timecop (which runs the checkct programs under valgrind).
  size_targets = ' '.join('%d'%bits for bits in databits)
  f.writelines((
    'default: libhighctidh.so\n',
    '\n',
    'all: default timecop generic %s\n' % size_targets,
    '\n',
    'generic: testrandom.out\n',
    '\n',
  ))

  per_size_rule = (
    '{0}: costpoly{0}.out checkct{0}untuned bench{0}untuned test{0}.out \\\n'
    'checkct{0}mults bench{0}mults \\\n'
    'checkct{0}cycles bench{0}cycles \\\n'
    'ubench{0} umults{0}\n'
    '\n'
  )
  for bits in databits:
    f.write(per_size_rule.format(bits))

  # timecop: every prerequisite line ends in a continuation; the bare newline
  # written after the loop terminates the prerequisite list.
  timecop_prereqs = 'checkct{0}untuned checkct{0}mults checkct{0}cycles \\\n'
  f.write('timecop: \\\n')
  for bits in databits:
    f.write(timecop_prereqs.format(bits))
  f.write('\n')

  timecop_recipe = (
    '\tvalgrind ./checkct{0}untuned\n'
    '\tvalgrind ./checkct{0}mults\n'
    '\tvalgrind ./checkct{0}cycles\n'
  )
  for bits in databits:
    f.write(timecop_recipe.format(bits))
  f.write('\n')

  f.write('# ----- benchmarks:\n')
  f.write('\n')

  # (target suffix, libraries linked) for the three flavours of each program.
  # For every program all sizes of one flavour are written before the next
  # flavour starts.
  bench_flavours = (
    ('cycles', 'libhighctidh_{0}.a libhighctidh_base.a libhighctidh_tunecycles{0}.a libtest.a'),
    ('mults', 'libhighctidh_{0}.a libhighctidh_base.a libtest.a'),
    ('untuned', 'libhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a'),
  )
  for app in ('checkct','bench'):
    for bench_flavour, bench_libs_template in bench_flavours:
      for bits in databits:
        bench_program = '{1}{0}{2}'.format(bits,app,bench_flavour)
        bench_libs = bench_libs_template.format(bits)
        f.writelines((
          '{0}: {1}.c \\\n'.format(bench_program,app),
          bench_libs + '\n',
          ccbits.format(bits),
          '\t\t-o {0} {1}.c \\\n'.format(bench_program,app),
          '\t\t' + bench_libs + '\n',
          '\n',
        ))

  f.writelines((
    '# ----- uintbig tests require HIGHCTIDH_PORTABLE=0 and never run automatically:\n',
    '\n',
  ))

  # test.c compiled with -DTEST_UINTBIG together with the assembly uintbig
  # source for the selected ASM_PLATFORM.
  for bits in databits:
    f.writelines((
      'testuintbig{0}: test.c uintbig{0}_$(ASM_PLATFORM).S\n'.format(bits),
      ccbits.format(bits),
      '\t\t-DTEST_UINTBIG \\\n',
      '\t\t-o $@ $^\n',
      '\n',
    ))

  # Size-independent random-number test: run it, then compare its output with
  # the stored expectation.
  f.writelines((
    '# ----- tests run automatically:\n',
    '\n',
    'testrandom.out: testrandom\n',
    '\t[ -f testrandom.time ] && cat testrandom.time || :\n',
    '\ttime ./testrandom > testrandom.out\n',
    '\tcmp testrandom.out testrandom.exp\n',
    '\n',
    'testrandom: testrandom.o libhighctidh_base.a libtest.a\n',
    '\t$(CC) -o testrandom testrandom.o libhighctidh_base.a libtest.a\n',
    '\n',
    'testrandom.o: testrandom.c random.h GNUmakefile\n',
    ccgeneric,
    '\t\t-c testrandom.c\n',
    '\n',
  ))

  # Per-size test programs (all link rules first, then all run rules).
  for bits in databits:
    f.writelines((
      'test{0}: test.c \\\n'.format(bits),
      'libhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
      ccbits.format(bits),
      '\t\t-o test{0} test.c \\\n'.format(bits),
      '\t\tlibhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
      '\n',
    ))
  test_run_rule = (
    'test{0}.out: test{0} test{0}.exp\n'
    '\t[ -f test{0}.time ] && cat test{0}.time || :\n'
    '\ttime ./test{0} > test{0}.out\n'
    '\tcmp test{0}.out test{0}.exp\n'
    '\n'
  )
  for bits in databits:
    f.write(test_run_rule.format(bits))

  f.writelines((
    '# ----- microbenchmarks (some run automatically):\n',
    '\n',
  ))

  # costpoly: run rules for every size first, then the link rules.
  costpoly_run_rule = (
    'costpoly{0}.out: costpoly{0}\n'
    '\t./costpoly{0} > costpoly{0}.out\n'
    '\tcmp costpoly{0}.out costpoly.py\n'
    '\n'
  )
  for bits in databits:
    f.write(costpoly_run_rule.format(bits))
  for bits in databits:
    f.writelines((
      'costpoly{0}: costpoly.c \\\n'.format(bits),
      'libhighctidh_{0}.a libhighctidh_base.a libtest.a\n'.format(bits),
      ccbits.format(bits),
      '\t\t-o costpoly{0} costpoly.c \\\n'.format(bits),
      '\t\tlibhighctidh_{0}.a libhighctidh_base.a libtest.a\n'.format(bits),
      '\n',
    ))

  # umults and ubench share one rule shape; all umults rules are written
  # before the first ubench rule.
  for ubench_tool in ('umults','ubench'):
    for bits in databits:
      f.writelines((
        '{1}{0}: {1}.c \\\n'.format(bits,ubench_tool),
        'libhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
        ccbits.format(bits),
        '\t\t-o {1}{0} {1}.c \\\n'.format(bits,ubench_tool),
        '\t\tlibhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
        '\n',
      ))

  f.writelines((
    '# ----- libhighctidh_tunecycles:\n',
    '\n',
  ))

  # The chain is written from the archive back to the program producing its
  # input: archive <- object <- generated C (via tune2c) <- tunecycles output
  # <- tunecycles program.  Each stage is emitted for all sizes in turn.
  tunecycles_archive_rule = (
    'libhighctidh_tunecycles{0}.a: steps_tunecycles{0}.o GNUmakefile\n'
    '\t$(AR) crs libhighctidh_tunecycles{0}.a steps_tunecycles{0}.o\n'
    '\n'
  )
  for bits in databits:
    f.write(tunecycles_archive_rule.format(bits))
  for bits in databits:
    f.writelines((
      'steps_tunecycles{0}.o: steps_tunecycles{0}.c steps.h GNUmakefile\n'.format(bits),
      ccgeneric,
      '\t\t-c steps_tunecycles{0}.c\n'.format(bits),
      '\n',
    ))
  tunecycles_source_rule = (
    'steps_tunecycles{0}.c: tunecycles{0}.out tune2c GNUmakefile\n'
    '\t./tune2c < tunecycles{0}.out > steps_tunecycles{0}.c\n'
    '\n'
  )
  for bits in databits:
    f.write(tunecycles_source_rule.format(bits))
  tunecycles_run_rule = (
    'tunecycles{0}.out: tunecycles{0} GNUmakefile\n'
    '\t[ -f tunecycles{0}.time ] && cat tunecycles{0}.time || :\n'
    '\ttime ./tunecycles{0} > tunecycles{0}.out\n'
    '\n'
  )
  for bits in databits:
    f.write(tunecycles_run_rule.format(bits))
  for bits in databits:
    f.writelines((
      'tunecycles{0}: tunecycles.c \\\n'.format(bits),
      'libhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
      ccbits.format(bits),
      '\t\t-o tunecycles{0} tunecycles.c \\\n'.format(bits),
      '\t\tlibhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits),
      '\n',
    ))


  # Static archives: one per field size, then the unified libhighctidh.a.
  archive_sizes = ','.join(str(bits) for bits in databits)
  f.write('# ----- libhighctidh_{%s}, size-dependent functions:\n' % archive_sizes)
  f.write('\n')
  for bits in databits:
    # The same member list serves as prerequisites and as 'ar' arguments.
    archive_members = '$(libhighctidh_{0}_OBJS) fp_inv{0}.o fp_sqrt{0}.o primes{0}.o poly{0}.o mont{0}.o elligator{0}.o skgen{0}.o validate{0}.o csidh{0}.o'.format(bits)
    f.write('libhighctidh_{0}.a: {1} GNUmakefile\n'.format(bits,archive_members))
    f.write('\t$(AR) crs libhighctidh_{0}.a {1}\n'.format(bits,archive_members))
    f.write('\n')

  # The big unified static lib:
  # NOTE(review): its member list names the fiat_p*/fp2fiat* objects
  # unconditionally, i.e. it does not go through $(libhighctidh_<bits>_OBJS).
  unified_members = ['crypto_classify.o crypto_declassify.o randombytes.o random.o int32_sort.o steps.o steps_untuned.o']
  for bits in databits:
    unified_members.append('csidh{0}.o elligator{0}.o fiat_p{0}.o fp2fiat{0}.o'.format(bits))
    unified_members.append('fp_inv{0}.o fp_sqrt{0}.o mont{0}.o poly{0}.o'.format(bits))
    unified_members.append('primes{0}.o skgen{0}.o validate{0}.o'.format(bits))
  f.write('libhighctidh.a: ' + ' '.join(unified_members) + '\n') # end of libhighctidh.a dep list
  f.write('\t$(AR) rcs libhighctidh.a $^\n\n')

  # Objects compiled once per field size from a size-independent source file.
  # The prerequisites are the source plus every header it includes.
  for c in ('csidh','validate','skgen','elligator','mont','poly'):
    # The include scan depends only on the source file, so it is done once
    # per source rather than once per (source, size) pair.
    ch = ' '.join(includes(c+'.c'))
    for bits in databits:
      f.write('{1}{0}.o: {2} GNUmakefile\n'.format(bits,c,ch))
      f.write(ccbits.format(bits))
      f.write('\t\t-o {1}{0}.o -c {1}.c\n'.format(bits,c))
      f.write('\n')

  # Objects whose source file is itself size-specific (<name><bits>.c), so
  # the include scan has to be repeated for every size.
  for c in ('fp_inv','fp_sqrt','primes'):
    for bits in databits:
      ch = ' '.join(includes('%s%s.c'%(c,bits)))
      f.write('{1}{0}.o: {2} GNUmakefile\n'.format(bits,c,ch))
      f.write(ccbits.format(bits))
      f.write('\t\t-c {1}{0}.c\n'.format(bits,c))
      f.write('\n')

  # Assembly field arithmetic: built with $(SCC), $(ASFLAGS) included.
  for bits in databits:
    f.writelines((
      'fp{0}.o: fp{0}_$(ASM_PLATFORM).S fp_const_le_{0}.s fp.h fp_namespace.h uintbig.h uintbig_namespace.h GNUmakefile\n'.format(bits),
      scc.format(bits),
      '\t\t$(ASFLAGS) \\\n',
      '\t\t-o $@ -c $<\n',
      '\n',
    ))

  # Assembly big-integer arithmetic: same shape, without $(ASFLAGS).
  for bits in databits:
    f.writelines((
      'uintbig{0}.o: uintbig{0}_$(ASM_PLATFORM).S uintbig_const_le_{0}.s uintbig.h uintbig_namespace.h GNUmakefile\n'.format(bits),
      scc.format(bits),
      '\t\t-o $@ -c $<\n',
      '\n',
    ))

  # fiat backend: the fiat_p<bits> object, then the fp2fiat wrapper built
  # from the shared fp2fiat.c with the size passed through -DBITS.
  fiat_rule = (
    'fiat_p{0}.o: fiat_p{0}.c fiat_p{0}.h\n'
    '\t$(SCC) -c fiat_p{0}.c -o fiat_p{0}.o\n'
    '\n'
  )
  for bits in databits:
    f.write(fiat_rule.format(bits))

  fp2fiat_rule = (
    'fp2fiat{0}.o: fp2fiat.c fiat_p{0}.h fiat_p{0}.o fp.h fp_namespace.h uintbig.h uintbig_namespace.h GNUmakefile\n'
    '\t$(SCC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' -D\'NAMESPACEGENERIC(x)=highctidh_##x\' \\\n'
    '\t-c fp2fiat.c -o fp2fiat{0}.o\n'
    '\n'
  )
  for bits in databits:
    f.write(fp2fiat_rule.format(bits))

  # Size-independent archives and their objects.  The last rule of this
  # section is deliberately not followed by a blank line here.
  f.writelines((
    '# ----- libhighctidh_untuned, size-independent but normally replaced by tuned functions:\n',
    '\n',
    'libhighctidh_untuned.a: steps_untuned.o GNUmakefile\n',
    '\t$(AR) crs libhighctidh_untuned.a steps_untuned.o\n',
    '\n',
    'steps_untuned.o: steps_untuned.c steps.h GNUmakefile\n',
    ccgeneric,
    '\t\t-c steps_untuned.c\n',
    '\n',
  ))

  f.writelines((
    '# ----- libhighctidh_base, size-independent functions:\n',
    '\n',
    'libhighctidh_base.a: steps.o random.o GNUmakefile\n',
    '\t$(AR) crs libhighctidh_base.a steps.o random.o\n',
    '\n',
    'steps.o: steps.c steps.h GNUmakefile\n',
    ccgeneric,
    '\t\t-c steps.c\n',
    '\n',
    'random.o: random.c random.h int32_sort.h randombytes.h GNUmakefile\n',
    ccgeneric,
    '\t\t-c random.c\n',
    '\n',
  ))

  f.writelines((
    '# ----- functions that libhighctidh wants from a core crypto library:\n',
    '\n',
    'libtest.a: crypto_classify.o crypto_declassify.o randombytes.o int32_sort.o GNUmakefile\n',
    '\t$(AR) crs libtest.a crypto_classify.o crypto_declassify.o randombytes.o int32_sort.o\n',
    '\n',
    'randombytes.o: randombytes.c randombytes.h GNUmakefile\n',
    cctest,
    '\t\t-c randombytes.c\n',
    '\n',
    'int32_sort.o: int32_sort.c int32_sort.h GNUmakefile\n',
    cctest,
    '\t\t-c int32_sort.c\n',
    '\n',
    'crypto_declassify.o: crypto_declassify.c crypto_declassify.h GNUmakefile\n',
    cctest,
    '\t\t-c crypto_declassify.c\n',
    '\n',
    'crypto_classify.o: crypto_classify.c crypto_classify.h GNUmakefile\n',
    cctest,
    '\t\t-c crypto_classify.c\n',
  ))

  f.write('\n\n# ----- shared library targets for libhighctidh_*.so:\n')

  # One shared library and one dynamically linked test program per size.
  for bits in databits:
    f.write('\n\nlibhighctidh_{0}.so: libhighctidh_{0}.a libhighctidh_base.a libhighctidh_untuned.a libtest.a\n'.format(bits))
    f.write('\t\t$(SCC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' -D\'NAMESPACEGENERIC(x)=highctidh_##x\' -shared -o libhighctidh_{0}.so $(libhighctidh_{0}_OBJS) fp_inv{0}.o fp_sqrt{0}.o primes{0}.o poly{0}.o mont{0}.o elligator{0}.o skgen{0}.o validate{0}.o steps.o random.o crypto_classify.o crypto_declassify.o randombytes.o int32_sort.o  steps_untuned.o csidh{0}.o\n'.format(bits))
    f.write('\n\ntest{0}-dyn: test{0} libhighctidh_{0}.so\n'.format(bits))
    f.write('\t\t$(CC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' -D\'NAMESPACEGENERIC(x)=highctidh_##x\' -o test{0}-dyn test.c -l highctidh_{0} -L .\n'.format(bits))

  # The aggregate targets below list one item per field size.  They are
  # derived from databits (instead of spelling the sizes out) so they cannot
  # drift from the per-size rules generated above.
  shared_libs = ' '.join('libhighctidh_{0}.so'.format(bits) for bits in databits)
  f.write('\n\nlibhighctidh.so: %s\n' % shared_libs)
  f.write('\t\tls -l libhighctidh_*.so\n\n')
  f.write('highctidh.wasm: libhighctidh.so\n')
  f.write('\texport LDFLAGS="-s SIDE_MODULE=1"\n')
  f.write('\t$(CC) -v --no-entry -O1 *.o -o dist/highctidh.wasm\n')
  f.write('\tls -al dist/highctidh.wasm\n\n')
  f.write('setup-examples: csidh.h\n')
  f.write('\t\t-mkdir -p libhighctidh\n')
  f.write('\t\t-cp csidh.h libhighctidh/\n\n')
  for bits in databits:
    f.write('example{0}: setup-examples libhighctidh_{0}.so\n'.format(bits))
    f.write('\t$(CC) -DBITS={0} -D\'NAMESPACEBITS(x)=highctidh_{0}_##x\' \\\n'.format(bits))
    f.write('\t-D\'NAMESPACEGENERIC(x)=highctidh_##x\' -o example-ctidh{0} example-ctidh.c \\\n'.format(bits))
    f.write('\t-L. -I. -l highctidh_{0}\n\n'.format(bits))

  example_targets = ' '.join('example{0}'.format(bits) for bits in databits)
  example_programs = ' '.join('example-ctidh{0}'.format(bits) for bits in databits)
  f.write('examples: %s\n' % example_targets)
  f.write('\t\tls -l %s\n\n' % example_programs)

  # Statically linked example program built against the unified archive.
  f.write('examples-static: examples_static.c libhighctidh.a *.h\n')
  f.write('\t$(CC) -static -Wall -Werror -Wpedantic examples_static.c -Wl,-Bstatic -L. -l:libhighctidh.a -o examples-static\n\n')

  # 'test' depends on one test<bits> program per field size; the list is
  # derived from databits so it follows the parameter table.
  test_programs = ' '.join('test{0}'.format(bits) for bits in databits)
  f.write('test: clean libhighctidh.so testrandom %s\n' % test_programs)
  f.write('\t\t./test.sh\n\n')

  # Installation of headers, per-size shared libraries and the unified
  # static archive below $(DESTDIR).
  f.write('DESTDIR ?= /usr/local\n')
  f.write('install: libhighctidh.so libhighctidh.a\n')
  f.write('\t\tinstall -d $(DESTDIR)/include/libhighctidh/\n')
  f.write('\t\tinstall -d $(DESTDIR)/lib/\n')
  f.write('\t\tinstall -v *.h $(DESTDIR)/include/libhighctidh/\n')
  f.write('\t\tinstall -v libhighctidh_*.so $(DESTDIR)/lib/\n')
  f.write('\t\tinstall -v libhighctidh.a $(DESTDIR)/lib/\n')

  f.write('\n\nclean:\n')
  f.write('\t\t-rm -f *.a *.o *.out *.so\n')
  f.write('\t\t-rm -f bench1024cycles bench1024mults bench1024untuned bench2048cycles bench2048mults bench2048untuned bench511cycles bench511mults bench511untuned bench512cycles bench512mults bench512untuned\n')
  f.write('\t\t-rm -f checkct1024cycles checkct1024mults checkct1024untuned checkct2048cycles checkct2048mults checkct2048untuned checkct511cycles checkct511mults checkct511untuned checkct512cycles checkct512mults checkct512untuned costpoly1024 costpoly2048 costpoly511 example-ctidh1024 example-ctidh2048\n')
  f.write('\t\t-rm -f example-ctidh511 example-ctidh512 test1024 test2048 test511 test511 test512 test512-dyn test*-dyn testuintbig1024 testuintbig2048 testuintbig511 testuintbig512 testrandom tunecycles1024 tunecycles2048 tunecycles511 tunecycles512 ubench1024 ubench2048 ubench511 ubench512 umults1024 umults2048 umults511 umults512\n')
  f.write('\t\t-rm -rf UNKNOWN.egg-info/ __pycache__/\n')
  f.write('\t\t-rm -rf build/ deb_dist/ dist/\n')
  f.write('\t\t-rm -rf highctidh.egg-info/ highctidh/__pycache__/\n')
  f.write('\t\t-rm -rf tests/__pycache__/\n')
  f.write('\t\t-rm -rf docker_build_output/\n')
  f.write('\t\t-rm -rf libhighctidh/\n')
  f.write('\t\t-rm -rf examples-static\n')
  f.write('\t\t-rm -rf highctidh-*.tar.gz\n')

maybeupdate(fn)
