From df7978ca2e8469c79d5e5584fb486633095d336a Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 6 May 2024 15:22:45 +0200
Subject: [PATCH 01/12] Initial nfloat module

---
 Makefile.in                |    1 +
 doc/source/index.rst       |    1 +
 doc/source/nfloat.rst      |  270 +++++++++
 src/gr.h                   |    1 +
 src/gr/test_ring.c         |    2 +-
 src/nfloat.h               |  307 +++++++++++
 src/nfloat/ctx.c           |  212 ++++++++
 src/nfloat/dot.c           |  377 +++++++++++++
 src/nfloat/inlines.c       |   14 +
 src/nfloat/nfloat.c        | 1052 ++++++++++++++++++++++++++++++++++++
 src/nfloat/test/main.c     |   28 +
 src/nfloat/test/t-nfloat.c |  166 ++++++
 12 files changed, 2430 insertions(+), 1 deletion(-)
 create mode 100644 doc/source/nfloat.rst
 create mode 100644 src/nfloat.h
 create mode 100644 src/nfloat/ctx.c
 create mode 100644 src/nfloat/dot.c
 create mode 100644 src/nfloat/inlines.c
 create mode 100644 src/nfloat/nfloat.c
 create mode 100644 src/nfloat/test/main.c
 create mode 100644 src/nfloat/test/t-nfloat.c
diff --git a/Makefile.in b/Makefile.in
index c948a21458..c0b49a775d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -152,6 +152,7 @@ HEADER_DIRS :=                                                              \
         long_extras                                                         \
         perm                                                                \
         double_extras   d_vec           d_mat                               \
+        nfloat                                                              \
         mpn_extras                                                          \
         mpfr_vec                        mpfr_mat                            \
         nmod            nmod_vec        nmod_mat        nmod_poly           \
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 906090a8d0..a2012745a9 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -189,6 +189,7 @@ Real and complex numbers
    issues.rst
    examples_arb.rst
    mag.rst
+   nfloat.rst
    arf.rst
    acf.rst
    arb.rst
diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
new file mode 100644
index 0000000000..bc372457d8
--- /dev/null
+++ b/doc/source/nfloat.rst
@@ -0,0 +1,270 @@
+.. _nfloat:
+
+**nfloat.h** -- packed floating-point numbers with n-word precision
+===============================================================================
+
+This module provides binary floating-point numbers in a flat representation
+with precision in small fixed multiples of the word size
+(64, 128, 192, 256, ... bits on a 64-bit machine). The exponent range is
+close to a full word.
+A number with `n`-limb precision is stored using `n+2` limbs as follows:
+
+    +---------------+
+    | exponent limb |
+    +---------------+
+    |   sign limb   |
+    +---------------+
+    |  mantissa[0]  |
+    +---------------+
+    |      ...      |
+    +---------------+
+    | mantissa[n-1] |
+    +---------------+
+
+This type has the advantage that floating-point numbers with the
+same precision can be packed together tightly in vectors
+and created on the stack without heap allocation.
+The precision of an ``nfloat`` context object and its elements cannot
+be changed; to switch precision, one must convert to a different
+context object. For higher precision than supported by ``nfloat``
+and for calculations that require fine-grained precision adjustments,
+one should use :type:`arf_t` instead.
+
+The focus is on fast calculation, not bitwise-defined results.
+Atomic operations typically give slightly worse than correct rounding,
+e.g. with 1-2 ulp error. The rounding is not guaranteed to be identical
+on 64-bit and 32-bit machines.
+Planned features include:
+
+* Directed rounding modes
+* Support for special values (partially implemented)
+* Optional (but slower) IEEE 754-like semantics
+* Complex and ball types
+
+This module is designed to use the :ref:`generics <gr>` interface.
+As such, the domain is represented by a :type:`gr_ctx_t` context object,
+methods return status flags (``GR_SUCCESS``, ``GR_UNABLE``, ``GR_DOMAIN``),
+and one can use generic structures such as :type:`gr_poly_t` for
+polynomials and :type:`gr_mat_t` for matrices.
+
+Types, macros and constants
+-------------------------------------------------------------------------------
+
+.. macro :: NFLOAT_MIN_LIMBS
+            NFLOAT_MAX_LIMBS
+
+    The number of limbs `n` permitted as precision. The current
+    limits are are `1 \le n \le 33` on a 64-bit machine and
+    `1 \le n \le 66` on a 32-bit machine, permitting precision
+    up to 2112 bits. The upper limit exists so that elements and
+    temporary buffers are safe to allocate on the stack and so that
+    simple operations like swapping are not too expensive.
+
+.. type:: nfloat_ptr
+          nfloat_srcptr
+
+    Pointer to an ``nfloat`` element or vector of elements of any
+    precision. Since this is a void type, one must cast to the correct
+    size before doing pointer arithmetic, e.g. via the ``GR_ENTRY``
+    macro.
+
+.. type:: nfloat64_struct
+          nfloat128_struct
+          nfloat192_struct
+          nfloat256_struct
+          nfloat384_struct
+          nfloat512_struct
+          nfloat1024_struct
+          nfloat2048_struct
+          nfloat64_t
+          nfloat128_t
+          nfloat192_t
+          nfloat256_t
+          nfloat384_t
+          nfloat512_t
+          nfloat1024_t
+          nfloat2048_t
+
+    For convenience we define types of the correct structure size for
+    some common levels of bit precision. An ``nfloatX_t`` is defined as
+    a length-one array of ``nfloatX_struct``, permitting it to be
+    passed by reference.
+
+    Sample usage:
+
+    .. code-block:: c
+
+        gr_ctx_t ctx;
+        nfloat256_t x, y;
+
+        nfloat_ctx_init(ctx, 256, 0);   /* precision must match the type */
+        gr_init(x, ctx);
+        gr_init(y, ctx);
+
+        gr_ctx_println(ctx);
+
+        GR_MUST_SUCCEED(gr_set_ui(x, 5, ctx));
+        GR_MUST_SUCCEED(gr_set_ui(y, 7, ctx));
+        GR_MUST_SUCCEED(gr_div(x, x, y, ctx));
+        GR_MUST_SUCCEED(gr_println(x, ctx));
+
+        gr_clear(x, ctx);
+        gr_clear(y, ctx);
+        gr_ctx_clear(ctx);
+
+.. macro:: NFLOAT_HEADER_LIMBS
+           NFLOAT_EXP(x)
+           NFLOAT_SGNBIT(x)
+           NFLOAT_D(x)
+           NFLOAT_DATA(x)
+
+.. macro:: NFLOAT_MAX_ALLOC
+
+.. macro:: NFLOAT_MIN_EXP
+           NFLOAT_MAX_EXP
+
+.. macro:: NFLOAT_EXP_ZERO
+           NFLOAT_EXP_POS_INF
+           NFLOAT_EXP_NEG_INF
+           NFLOAT_EXP_NAN
+           NFLOAT_IS_SPECIAL(x)
+           NFLOAT_IS_ZERO(x)
+           NFLOAT_IS_POS_INF(x)
+           NFLOAT_IS_NEG_INF(x)
+           NFLOAT_IS_INF(x)
+           NFLOAT_IS_NAN(x)
+
+Context objects
+-------------------------------------------------------------------------------
+
+.. function:: int nfloat_ctx_init(gr_ctx_t ctx, slong prec, int flags)
+
+    Initializes *ctx* to represent a domain of floating-point numbers
+    with bit precision *prec* rounded up to a full word
+    (for example, ``prec = 53`` actually creates a domain with
+    64-bit precision).
+
+    Returns ``GR_UNABLE`` without initializating the context object
+    if the given precision is too large to be supported, otherwise
+    returns ``GR_SUCCESS``.
+
+    Admissible flags are listed below.
+
+.. macro:: NFLOAT_ALLOW_UNDERFLOW
+
+    By default, operations that would underflow the exponent range
+    output a garbage value and return ``GR_UNABLE``.
+    Setting this flag allows such operations to
+    output zero and return ``GR_SUCCESS`` instead.
+
+.. macro:: NFLOAT_ALLOW_INF
+
+    Allow creation of infinities.
+    By default, operations that would overflow the exponent range
+    output a garbage value and return ``GR_UNABLE`` or ``GR_DOMAIN``.
+    Setting this flag allows such operations to
+    output an infinity and return ``GR_SUCCESS`` instead.
+
+.. macro:: NFLOAT_ALLOW_NAN
+
+    Allow creation of NaNs.
+    By default, operations that are meaningless
+    output a garbage value and return ``GR_UNABLE`` or ``GR_DOMAIN``.
+    Setting this flag allows such operations to
+    output NaN and return ``GR_SUCCESS`` instead.
+
+Infinities and NaNs are disabled by default to improve performance,
+as this allows certain functions to skip checks for such values.
+
+Basic operations and arithmetic
+-------------------------------------------------------------------------------
+
+Basic functionality for the ``gr`` method table.
+These methods are interchangeable with their ``gr`` counterparts.
+
+.. function:: int nfloat_ctx_write(gr_stream_t out, gr_ctx_t ctx)
+
+.. function:: void nfloat_init(nfloat_ptr res, gr_ctx_t ctx)
+
+    Initializes *res* to the zero element.
+
+.. function:: void nfloat_clear(nfloat_ptr res, gr_ctx_t ctx)
+
+    Since ``nfloat`` elements do no allocation, this is a no-op.
+
+.. function:: void nfloat_swap(nfloat_ptr x, nfloat_ptr y, gr_ctx_t ctx)
+
+.. function:: int nfloat_set(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+
+.. function:: truth_t nfloat_equal(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+
+.. function:: int nfloat_ctx_set_real_prec(gr_ctx_t ctx, slong prec)
+
+    Since ``nfloat`` contexts do not allow variable precision,
+    this does nothing and returns ``GR_UNABLE``.
+
+.. function:: int nfloat_ctx_get_real_prec(slong * res, gr_ctx_t ctx)
+
+    Sets *res* to the precision in bits and returns ``GR_SUCCESS``.
+
+.. function:: int nfloat_zero(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_one(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_neg_one(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_pos_inf(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_neg_inf(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_nan(nfloat_ptr res, gr_ctx_t ctx)
+
+.. function:: truth_t nfloat_is_zero(nfloat_srcptr x, gr_ctx_t ctx)
+              truth_t nfloat_is_one(nfloat_srcptr x, gr_ctx_t ctx)
+              truth_t nfloat_is_neg_one(nfloat_srcptr x, gr_ctx_t ctx)
+
+.. function:: int nfloat_set_ui(nfloat_ptr res, ulong x, gr_ctx_t ctx)
+              int nfloat_set_si(nfloat_ptr res, slong x, gr_ctx_t ctx)
+              int nfloat_set_fmpz(nfloat_ptr res, const fmpz_t x, gr_ctx_t ctx)
+
+.. function:: int _nfloat_set_mpn_2exp(nfloat_ptr res, mp_srcptr x, mp_size_t xn, slong exp, int xsgnbit, gr_ctx_t ctx)
+              int nfloat_set_mpn_2exp(nfloat_ptr res, mp_srcptr x, mp_size_t xn, slong exp, int xsgnbit, gr_ctx_t ctx)
+
+.. function:: int nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx)
+              int nfloat_get_arf(arf_t res, nfloat_srcptr x, gr_ctx_t ctx)
+
+.. function:: int nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx)
+
+.. function:: int nfloat_cmp(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int nfloat_cmpabs(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+
+.. function:: int nfloat_neg(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_abs(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+
+.. function:: int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
+              int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
+
+.. function:: int nfloat_sgn(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_im(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+
+Vector functions
+-------------------------------------------------------------------------------
+
+Overrides for generic ``gr`` vector operations with inlined or partially inlined
+code for reduced overhead.
+
+.. function:: int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+              int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+
+Internal functions
+-------------------------------------------------------------------------------
+
+.. function:: int _nfloat_underflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
+              int _nfloat_overflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
+
+.. function:: int _nfloat_cmp(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int _nfloat_cmpabs(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+              int _nfloat_add_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx)
+              int _nfloat_sub_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx)
+              int _nfloat_add_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx)
+              int _nfloat_sub_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx)
diff --git a/src/gr.h b/src/gr.h
index 7b0519e728..d5624cbc97 100644
--- a/src/gr.h
+++ b/src/gr.h
@@ -730,6 +730,7 @@ typedef enum
     GR_CTX_COMPLEX_EXTENDED_CA,
     GR_CTX_RR_ARB, GR_CTX_CC_ACB,
     GR_CTX_REAL_FLOAT_ARF, GR_CTX_COMPLEX_FLOAT_ACF,
+    GR_CTX_NFLOAT,
     GR_CTX_FMPZ_POLY, GR_CTX_FMPQ_POLY, GR_CTX_GR_POLY,
     GR_CTX_FMPZ_MPOLY, GR_CTX_GR_MPOLY,
     GR_CTX_FMPZ_MPOLY_Q,
diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c
index fa22499548..16036c9d16 100644
--- a/src/gr/test_ring.c
+++ b/src/gr/test_ring.c
@@ -1354,7 +1354,7 @@ gr_test_zero_one(gr_ctx_t R, flint_rand_t state, int test_flags)
     equal = gr_is_one(a, R);
     if (status == GR_SUCCESS && equal == T_FALSE)
     {
-        flint_printf("FAILL is_one\n");
+        flint_printf("FAIL is_one\n");
         gr_ctx_println(R);
         gr_println(a, R);
         status = GR_TEST_FAIL;
diff --git a/src/nfloat.h b/src/nfloat.h
new file mode 100644
index 0000000000..eff8a3b894
--- /dev/null
+++ b/src/nfloat.h
@@ -0,0 +1,307 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef NFLOAT_H
+#define NFLOAT_H
+
+#ifdef NFLOAT_INLINES_C
+#define NFLOAT_INLINE
+#else
+#define NFLOAT_INLINE static inline
+#endif
+
+#include "flint.h"
+#include "mpn_extras.h"
+#include "gr.h"
+#include "gr_poly.h"
+#include "gr_mat.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The upper limit is so that temporary buffers are safe to allocate on
+   the stack and so that simple operations like swapping are not too
+   expensive compared to a pointer-and-size representation. */
+#if FLINT_BITS == 64
+#define NFLOAT_MIN_LIMBS 1
+#define NFLOAT_MAX_LIMBS 33
+#else
+#define NFLOAT_MIN_LIMBS 1
+#define NFLOAT_MAX_LIMBS 66
+#endif
+
+/* Number of header limbs used to encode sign + exponent. We use a
+   whole limb for the sign bit to avoid the overhead of bit fiddling. */
+#define NFLOAT_HEADER_LIMBS 2
+#define NFLOAT_EXP(x) (((mp_limb_signed_t *) x)[0])
+#define NFLOAT_SGNBIT(x) (((mp_ptr) x)[1])
+#define NFLOAT_D(x) (((mp_ptr) x) + NFLOAT_HEADER_LIMBS)
+#define NFLOAT_DATA(x) (((mp_ptr) x))
+
+/* Limbs needed to hold a temporary element of any precision. */
+#define NFLOAT_MAX_ALLOC (NFLOAT_HEADER_LIMBS + NFLOAT_MAX_LIMBS)
+
+/* The exponent limits should not be too close to the word boundary,
+   so that we can add two exponents plus some adjustmenets
+   and safely intercept overflow/underflow. */
+#define NFLOAT_MIN_EXP (WORD_MIN / 4)
+#define NFLOAT_MAX_EXP (WORD_MAX / 4)
+
+/* Special values (including zero). */
+/* TODO: is it better to encode special values in the exponent or in
+         the sign field? */
+/* TODO: do we want signed zero (optionally)? */
+#define NFLOAT_EXP_ZERO      WORD_MIN
+#define NFLOAT_EXP_POS_INF   (WORD_MIN+1)
+#define NFLOAT_EXP_NEG_INF   (WORD_MIN+2)
+#define NFLOAT_EXP_NAN       (WORD_MIN+3)
+#define NFLOAT_IS_SPECIAL(x) (NFLOAT_EXP(x) < NFLOAT_MIN_EXP)
+#define NFLOAT_IS_ZERO(x)    (NFLOAT_EXP(x) == NFLOAT_EXP_ZERO)
+#define NFLOAT_IS_POS_INF(x) (NFLOAT_EXP(x) == NFLOAT_EXP_POS_INF)
+#define NFLOAT_IS_NEG_INF(x) (NFLOAT_EXP(x) == NFLOAT_EXP_NEG_INF)
+#define NFLOAT_IS_INF(x)     (NFLOAT_IS_POS_INF(x) || NFLOAT_IS_NEG_INF(x))
+#define NFLOAT_IS_NAN(x)     (NFLOAT_EXP(x) == NFLOAT_EXP_NAN)
+
+/* Context flags to indicate whether we want to allow special values
+   or return GR_DOMAIN / GR_UNABLE. */
+#define NFLOAT_ALLOW_UNDERFLOW 1
+#define NFLOAT_ALLOW_INF       2
+#define NFLOAT_ALLOW_NAN       4
+
+typedef struct
+{
+    slong nlimbs;
+    int flags;
+    int rnd;    /* Allow rounding modes? Currently unused. */
+}
+_nfloat_ctx_struct;
+
+typedef void * nfloat_ptr;
+typedef const void * nfloat_srcptr;
+
+#define NFLOAT_CTX(ctx) ((_nfloat_ctx_struct *)(ctx))
+#define NFLOAT_CTX_NLIMBS(ctx) (NFLOAT_CTX(ctx)->nlimbs)
+#define NFLOAT_CTX_PREC(ctx) ((NFLOAT_CTX(ctx)->nlimbs) * FLINT_BITS)
+#define NFLOAT_CTX_FLAGS(ctx) (NFLOAT_CTX(ctx)->flags)
+#define NFLOAT_CTX_RND(ctx) (NFLOAT_CTX(ctx)->rnd)
+#define NFLOAT_CTX_DATA_NLIMBS(ctx) (NFLOAT_CTX_NLIMBS(ctx) + NFLOAT_HEADER_LIMBS)
+#define NFLOAT_CTX_HAS_INF_NAN(ctx) ((NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)) != 0)
+
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[64 / FLINT_BITS]; } nfloat64_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[128 / FLINT_BITS]; } nfloat128_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[192 / FLINT_BITS]; } nfloat192_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[256 / FLINT_BITS]; } nfloat256_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[384 / FLINT_BITS]; } nfloat384_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[512 / FLINT_BITS]; } nfloat512_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[1024 / FLINT_BITS]; } nfloat1024_struct;
+typedef struct { mp_limb_t head[NFLOAT_HEADER_LIMBS]; mp_limb_t d[2048 / FLINT_BITS]; } nfloat2048_struct;
+
+typedef nfloat64_struct nfloat64_t[1];
+typedef nfloat128_struct nfloat128_t[1];
+typedef nfloat192_struct nfloat192_t[1];
+typedef nfloat256_struct nfloat256_t[1];
+typedef nfloat384_struct nfloat384_t[1];
+typedef nfloat512_struct nfloat512_t[1];
+typedef nfloat1024_struct nfloat1024_t[1];
+typedef nfloat2048_struct nfloat2048_t[1];
+
+#define LIMB_MSB_IS_SET(n) ((mp_limb_signed_t) (n) < 0)
+
+int nfloat_ctx_init(gr_ctx_t ctx, slong prec, int flags);
+int nfloat_ctx_write(gr_stream_t out, gr_ctx_t ctx);
+
+NFLOAT_INLINE void
+nfloat_init(nfloat_ptr res, gr_ctx_t ctx)
+{
+    NFLOAT_EXP(res) = NFLOAT_EXP_ZERO;
+    NFLOAT_SGNBIT(res) = 0;
+}
+
+NFLOAT_INLINE void
+nfloat_clear(nfloat_ptr res, gr_ctx_t ctx)
+{
+}
+
+void nfloat_swap(nfloat_ptr x, nfloat_ptr y, gr_ctx_t ctx);
+int nfloat_set(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+
+truth_t nfloat_equal(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+
+NFLOAT_INLINE
+int _nfloat_ctx_set_real_prec(gr_ctx_t ctx, slong prec)
+{
+    return GR_UNABLE;
+}
+
+NFLOAT_INLINE
+int _nfloat_ctx_get_real_prec(slong * res, gr_ctx_t ctx)
+{
+    *res = NFLOAT_CTX_PREC(ctx);
+    return GR_SUCCESS;
+}
+
+NFLOAT_INLINE int
+nfloat_zero(nfloat_ptr res, gr_ctx_t ctx)
+{
+    NFLOAT_EXP(res) = NFLOAT_EXP_ZERO;
+    NFLOAT_SGNBIT(res) = 0;
+    return GR_SUCCESS;
+}
+
+int nfloat_pos_inf(nfloat_ptr res, gr_ctx_t ctx);
+int nfloat_neg_inf(nfloat_ptr res, gr_ctx_t ctx);
+int nfloat_nan(nfloat_ptr res, gr_ctx_t ctx);
+
+int _nfloat_underflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx);
+int _nfloat_overflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx);
+
+#define NFLOAT_HANDLE_UNDERFLOW(res, ctx) \
+    do { \
+        if (NFLOAT_EXP(res) < NFLOAT_MIN_EXP) \
+            return _nfloat_underflow(res, NFLOAT_SGNBIT(res), ctx); \
+    } while (0)
+
+#define NFLOAT_HANDLE_OVERFLOW(res, ctx) \
+    do { \
+        if (NFLOAT_EXP(res) < NFLOAT_MIN_EXP) \
+            return _nfloat_underflow(res, NFLOAT_SGNBIT(res), ctx); \
+    } while (0)
+
+#define NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx) \
+    do { \
+        NFLOAT_HANDLE_UNDERFLOW(res, ctx); \
+        NFLOAT_HANDLE_OVERFLOW(res, ctx); \
+    } while (0)
+
+int nfloat_one(nfloat_ptr res, gr_ctx_t ctx);
+int nfloat_neg_one(nfloat_ptr res, gr_ctx_t ctx);
+
+NFLOAT_INLINE truth_t
+nfloat_is_zero(nfloat_srcptr x, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_NAN(x))
+        return T_UNKNOWN;
+
+    return NFLOAT_IS_ZERO(x) ? T_TRUE : T_FALSE;
+}
+
+truth_t nfloat_is_one(nfloat_srcptr x, gr_ctx_t ctx);
+truth_t nfloat_is_neg_one(nfloat_srcptr x, gr_ctx_t ctx);
+
+int nfloat_set_ui(nfloat_ptr res, ulong x, gr_ctx_t ctx);
+int nfloat_set_si(nfloat_ptr res, slong x, gr_ctx_t ctx);
+
+/* Here exp is understood such that {x, xn} is a fraction in [0, 1). */
+NFLOAT_INLINE int
+_nfloat_set_mpn_2exp(nfloat_ptr res, mp_srcptr x, mp_size_t xn, slong exp, int xsgnbit, gr_ctx_t ctx)
+{
+    mp_limb_t top;
+    slong norm;
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+    FLINT_ASSERT(xn >= 1);
+    FLINT_ASSERT(x[xn - 1] != 0);
+
+    top = x[xn - 1];
+
+    if (LIMB_MSB_IS_SET(top))
+    {
+        if (xn >= nlimbs)
+        {
+            flint_mpn_copyi(NFLOAT_D(res), x + xn - nlimbs, nlimbs);
+        }
+        else
+        {
+            flint_mpn_zero(NFLOAT_D(res), nlimbs - xn);
+            flint_mpn_copyi(NFLOAT_D(res) + nlimbs - xn, x, xn);
+        }
+    }
+    else
+    {
+        norm = flint_clz(top);
+
+        if (xn > nlimbs)
+        {
+            mpn_lshift(NFLOAT_D(res), x + xn - nlimbs, xn, norm);
+            NFLOAT_D(res)[0] |= (x[xn - nlimbs - 1] >> (FLINT_BITS - norm));
+        }
+        else
+        {
+            flint_mpn_zero(NFLOAT_D(res), nlimbs - xn);
+            mpn_lshift(NFLOAT_D(res) + nlimbs - xn, x, xn, norm);
+        }
+
+        exp -= norm;
+    }
+
+    NFLOAT_SGNBIT(res) = xsgnbit;
+    NFLOAT_EXP(res) = exp;
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+
+    return GR_SUCCESS;
+}
+
+NFLOAT_INLINE int
+nfloat_set_mpn_2exp(nfloat_ptr res, mp_srcptr x, mp_size_t xn, slong exp, int xsgnbit, gr_ctx_t ctx)
+{
+    while (xn != 0 && x[xn - 1] == 0)
+    {
+        xn--;
+        exp -= FLINT_BITS;
+    }
+
+    if (xn == 0)
+        return nfloat_zero(res, ctx);
+
+    return _nfloat_set_mpn_2exp(res, x, xn, exp, xsgnbit, ctx);
+}
+
+int nfloat_set_fmpz(nfloat_ptr res, const fmpz_t x, gr_ctx_t ctx);
+
+#ifdef ARF_H
+int nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx);
+int nfloat_get_arf(arf_t res, nfloat_srcptr x, gr_ctx_t ctx);
+#endif
+
+int nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx);
+
+int nfloat_neg(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_abs(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int _nfloat_cmp(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int _nfloat_cmpabs(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_cmp(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_cmpabs(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_sgn(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_im(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+
+int _nfloat_add_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx);
+int _nfloat_sub_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx);
+int _nfloat_add_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx);
+int _nfloat_sub_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx);
+
+int nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+
+int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx);
+int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx);
+
+int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
+int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
new file mode 100644
index 0000000000..d37319a183
--- /dev/null
+++ b/src/nfloat/ctx.c
@@ -0,0 +1,212 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "nfloat.h"
+
+int _nfloat_methods_initialized = 0;
+
+gr_static_method_table _nfloat_methods;
+
+gr_method_tab_input _nfloat_methods_input[] =
+{
+    {GR_METHOD_CTX_WRITE,       (gr_funcptr) nfloat_ctx_write},
+    {GR_METHOD_CTX_IS_RING,     (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_COMMUTATIVE_RING, (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_INTEGRAL_DOMAIN,  (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_FIELD,            (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_UNIQUE_FACTORIZATION_DOMAIN,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_FINITE,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_FINITE_CHARACTERISTIC,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_ALGEBRAICALLY_CLOSED,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_ORDERED_RING,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_EXACT,    (gr_funcptr) gr_generic_ctx_predicate_false},
+    {GR_METHOD_CTX_IS_CANONICAL,
+                                (gr_funcptr) gr_generic_ctx_predicate_false},
+
+    {GR_METHOD_CTX_HAS_REAL_PREC, (gr_funcptr) gr_generic_ctx_predicate_true},
+    {GR_METHOD_CTX_SET_REAL_PREC, (gr_funcptr) _nfloat_ctx_set_real_prec},
+    {GR_METHOD_CTX_GET_REAL_PREC, (gr_funcptr) _nfloat_ctx_get_real_prec},
+
+    {GR_METHOD_INIT,            (gr_funcptr) nfloat_init},
+    {GR_METHOD_CLEAR,           (gr_funcptr) nfloat_clear},
+    {GR_METHOD_SWAP,            (gr_funcptr) nfloat_swap},
+    {GR_METHOD_SET_SHALLOW,     (gr_funcptr) nfloat_set},
+    {GR_METHOD_RANDTEST,        (gr_funcptr) nfloat_randtest},
+    {GR_METHOD_WRITE,           (gr_funcptr) nfloat_write},
+    {GR_METHOD_ZERO,            (gr_funcptr) nfloat_zero},
+    {GR_METHOD_ONE,             (gr_funcptr) nfloat_one},
+    {GR_METHOD_IS_ZERO,         (gr_funcptr) nfloat_is_zero},
+    {GR_METHOD_IS_ONE,          (gr_funcptr) nfloat_is_one},
+    {GR_METHOD_IS_NEG_ONE,      (gr_funcptr) nfloat_is_neg_one},
+    {GR_METHOD_EQUAL,           (gr_funcptr) nfloat_equal},
+    {GR_METHOD_SET,             (gr_funcptr) nfloat_set},
+    {GR_METHOD_SET_SI,          (gr_funcptr) nfloat_set_si},
+    {GR_METHOD_SET_UI,          (gr_funcptr) nfloat_set_ui},
+    {GR_METHOD_SET_FMPZ,        (gr_funcptr) nfloat_set_fmpz},
+/*
+    {GR_METHOD_SET_FMPQ,        (gr_funcptr) nfloat_set_fmpq},
+    {GR_METHOD_SET_D,           (gr_funcptr) nfloat_set_d},
+    {GR_METHOD_SET_STR,         (gr_funcptr) nfloat_set_str},
+    {GR_METHOD_SET_OTHER,       (gr_funcptr) nfloat_set_other},
+    {GR_METHOD_GET_FMPZ,        (gr_funcptr) nfloat_get_fmpz},
+    {GR_METHOD_GET_FMPQ,        (gr_funcptr) nfloat_get_fmpq},
+    {GR_METHOD_GET_UI,          (gr_funcptr) nfloat_get_ui},
+    {GR_METHOD_GET_SI,          (gr_funcptr) nfloat_get_si},
+    {GR_METHOD_GET_D,           (gr_funcptr) nfloat_get_d},
+*/
+
+    {GR_METHOD_NEG,             (gr_funcptr) nfloat_neg},
+    {GR_METHOD_ADD,             (gr_funcptr) nfloat_add},
+/*
+    {GR_METHOD_ADD_UI,          (gr_funcptr) nfloat_add_ui},
+    {GR_METHOD_ADD_SI,          (gr_funcptr) nfloat_add_si},
+    {GR_METHOD_ADD_FMPZ,        (gr_funcptr) nfloat_add_fmpz},
+*/
+    {GR_METHOD_SUB,             (gr_funcptr) nfloat_sub},
+/*
+    {GR_METHOD_SUB_UI,          (gr_funcptr) nfloat_sub_ui},
+    {GR_METHOD_SUB_SI,          (gr_funcptr) nfloat_sub_si},
+    {GR_METHOD_SUB_FMPZ,        (gr_funcptr) nfloat_sub_fmpz},
+*/
+    {GR_METHOD_MUL,             (gr_funcptr) nfloat_mul},
+/*
+    {GR_METHOD_MUL_UI,          (gr_funcptr) nfloat_mul_ui},
+    {GR_METHOD_MUL_SI,          (gr_funcptr) nfloat_mul_si},
+    {GR_METHOD_MUL_FMPZ,        (gr_funcptr) nfloat_mul_fmpz},
+    {GR_METHOD_MUL_TWO,         (gr_funcptr) nfloat_mul_two},
+    {GR_METHOD_ADDMUL,          (gr_funcptr) nfloat_addmul},
+    {GR_METHOD_SUBMUL,          (gr_funcptr) nfloat_submul},
+    {GR_METHOD_SQR,             (gr_funcptr) nfloat_sqr},
+*/
+    {GR_METHOD_DIV,             (gr_funcptr) nfloat_div},
+    {GR_METHOD_DIV_UI,          (gr_funcptr) nfloat_div_ui},
+    {GR_METHOD_DIV_SI,          (gr_funcptr) nfloat_div_si},
+/*
+    {GR_METHOD_DIV_FMPZ,        (gr_funcptr) nfloat_div_fmpz},
+    {GR_METHOD_INV,             (gr_funcptr) nfloat_inv},
+
+    {GR_METHOD_MUL_2EXP_SI,        (gr_funcptr) nfloat_mul_2exp_si},
+    {GR_METHOD_MUL_2EXP_FMPZ,      (gr_funcptr) nfloat_mul_2exp_fmpz},
+    {GR_METHOD_SET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_set_fmpz_2exp_fmpz},
+    {GR_METHOD_GET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_get_fmpz_2exp_fmpz},
+
+    {GR_METHOD_POW,             (gr_funcptr) nfloat_pow},
+*/
+
+/*
+    {GR_METHOD_POW_UI,          (gr_funcptr) nfloat_pow_ui},
+    {GR_METHOD_POW_SI,          (gr_funcptr) nfloat_pow_si},
+    {GR_METHOD_POW_FMPZ,        (gr_funcptr) nfloat_pow_fmpz},
+    {GR_METHOD_POW_FMPQ,        (gr_funcptr) nfloat_pow_fmpq},
+*/
+
+/*
+    {GR_METHOD_SQRT,            (gr_funcptr) nfloat_sqrt},
+    {GR_METHOD_RSQRT,           (gr_funcptr) nfloat_rsqrt},
+
+    {GR_METHOD_POS_INF,         (gr_funcptr) nfloat_pos_inf},
+    {GR_METHOD_NEG_INF,         (gr_funcptr) nfloat_neg_inf},
+    {GR_METHOD_UINF,            (gr_funcptr) gr_not_in_domain},
+    {GR_METHOD_UNDEFINED,       (gr_funcptr) nfloat_nan},
+    {GR_METHOD_UNKNOWN,         (gr_funcptr) nfloat_nan},
+
+    {GR_METHOD_FLOOR,           (gr_funcptr) nfloat_floor},
+    {GR_METHOD_CEIL,            (gr_funcptr) nfloat_ceil},
+    {GR_METHOD_TRUNC,           (gr_funcptr) nfloat_trunc},
+    {GR_METHOD_NINT,            (gr_funcptr) nfloat_nint},
+*/
+
+    {GR_METHOD_ABS,             (gr_funcptr) nfloat_abs},
+    {GR_METHOD_CONJ,            (gr_funcptr) nfloat_set},
+    {GR_METHOD_RE,              (gr_funcptr) nfloat_set},
+    {GR_METHOD_IM,              (gr_funcptr) nfloat_im},
+    {GR_METHOD_SGN,             (gr_funcptr) nfloat_sgn},
+    {GR_METHOD_CSGN,            (gr_funcptr) nfloat_sgn},
+    {GR_METHOD_CMP,             (gr_funcptr) nfloat_cmp},
+    {GR_METHOD_CMPABS,          (gr_funcptr) nfloat_cmpabs},
+
+    {GR_METHOD_I,               (gr_funcptr) gr_not_in_domain},
+/*
+    {GR_METHOD_PI,              (gr_funcptr) nfloat_pi},
+    {GR_METHOD_EXP,             (gr_funcptr) nfloat_exp},
+    {GR_METHOD_EXPM1,           (gr_funcptr) nfloat_expm1},
+    {GR_METHOD_LOG,             (gr_funcptr) nfloat_log},
+    {GR_METHOD_LOG1P,           (gr_funcptr) nfloat_log1p},
+    {GR_METHOD_SIN,             (gr_funcptr) nfloat_sin},
+    {GR_METHOD_COS,             (gr_funcptr) nfloat_cos},
+    {GR_METHOD_TAN,             (gr_funcptr) nfloat_tan},
+    {GR_METHOD_SINH,            (gr_funcptr) nfloat_sinh},
+    {GR_METHOD_COSH,            (gr_funcptr) nfloat_cosh},
+    {GR_METHOD_TANH,            (gr_funcptr) nfloat_tanh},
+    {GR_METHOD_ATAN,            (gr_funcptr) nfloat_atan},
+    {GR_METHOD_GAMMA,            (gr_funcptr) nfloat_gamma},
+    {GR_METHOD_ZETA,             (gr_funcptr) nfloat_zeta},
+*/
+
+/*
+    {GR_METHOD_VEC_ADD,                 (gr_funcptr) _nfloat_vec_add},
+    {GR_METHOD_VEC_SUB,                 (gr_funcptr) _nfloat_vec_sub},
+*/
+    {GR_METHOD_VEC_DOT,         (gr_funcptr) _nfloat_vec_dot},
+    {GR_METHOD_VEC_DOT_REV,     (gr_funcptr) _nfloat_vec_dot_rev},
+/*
+    {GR_METHOD_POLY_MULLOW,     (gr_funcptr) nfloat_poly_mullow},
+    {GR_METHOD_POLY_ROOTS_OTHER,(gr_funcptr) nfloat_poly_roots_other},
+    {GR_METHOD_MAT_MUL,         (gr_funcptr) nfloat_mat_mul},
+    {GR_METHOD_MAT_DET,         (gr_funcptr) gr_mat_det_generic_field},
+    {GR_METHOD_MAT_FIND_NONZERO_PIVOT,     (gr_funcptr) gr_mat_find_nonzero_pivot_large_abs},
+*/
+
+    {0,                         (gr_funcptr) NULL},
+};
+
+int
+nfloat_ctx_init(gr_ctx_t ctx, slong prec, int flags)
+{
+    slong nlimbs;
+
+    if (prec <= 0 || prec > NFLOAT_MAX_LIMBS * FLINT_BITS)
+        return GR_UNABLE;
+
+    nlimbs = (prec + FLINT_BITS - 1) / FLINT_BITS;
+
+    ctx->which_ring = GR_CTX_NFLOAT;
+    ctx->sizeof_elem = sizeof(mp_limb_t) * (nlimbs + NFLOAT_HEADER_LIMBS);
+    ctx->size_limit = WORD_MAX;
+
+    NFLOAT_CTX_NLIMBS(ctx) = nlimbs;
+    NFLOAT_CTX_FLAGS(ctx) = flags;
+    NFLOAT_CTX_RND(ctx) = 0;
+
+    ctx->methods = _nfloat_methods;
+
+    if (!_nfloat_methods_initialized)
+    {
+        gr_method_tab_init(_nfloat_methods, _nfloat_methods_input);
+        _nfloat_methods_initialized = 1;
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_ctx_write(gr_stream_t out, gr_ctx_t ctx)
+{
+    gr_stream_write(out, "Floating-point numbers with prec = ");
+    gr_stream_write_si(out, NFLOAT_CTX_PREC(ctx));
+    gr_stream_write(out, " (nfloat)");
+    return GR_SUCCESS;
+}
diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c
new file mode 100644
index 0000000000..546e032cf5
--- /dev/null
+++ b/src/nfloat/dot.c
@@ -0,0 +1,377 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "nfloat.h"
+
+/* Having some dummy code in the continue block makes the loop go faster. Don't ask me why. */
+#if 1
+#define DUMMY_OP add_ssaaaa(s1, s0, s1, s0, 0, 0)
+#else
+#define DUMMY_OP 0
+#endif
+
+/* Experimental: use fast but inaccurate product? */
+#define FLINT_MPN_MUL_2X2H(r3, r2, r1, a1, a0, b1, b0)   \
+    do {                                                                  \
+        mp_limb_t __t1, __t2, __u1, __u2, __v3, __v2;                     \
+        mp_limb_t __r3, __r2, __r1, __r0;                                 \
+        mp_limb_t __a1 = (a1), __a0 = (a0), __b1 = (b1), __b0 = (b0);     \
+        umul_ppmm(__t2, __t1, __a0, __b1);                                \
+        umul_ppmm(__u2, __u1, __a1, __b0);                                \
+        add_sssaaaaaa(__r3, __r2, __r1, 0, __t2, __t1, 0, __u2, __u1);    \
+        umul_ppmm(__v3, __v2, __a1, __b1);                                \
+        add_ssaaaa(__r3, __r2, __r3, __r2, __v3, __v2);                   \
+        (r1) = __r1; (r2) = __r2; (r3) = __r3;                            \
+    } while (0)
+
+int
+__nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, slong sizeof_xstep, nfloat_srcptr y, slong sizeof_ystep, slong len, gr_ctx_t ctx)
+{
+    if (NFLOAT_CTX_NLIMBS(ctx) == 1 && !(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)))
+    {
+        slong i, xexp, delta, sexp, norm, pad_bits;
+        int xsgnbit;
+        mp_limb_t t1, t0, s1, s0;
+        nfloat_srcptr xi, yi;
+        int have_zeros = 0;
+
+        s1 = s0 = 0;
+        sexp = WORD_MIN;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+            sexp = NFLOAT_EXP(initial);
+
+        /* Lookahead to determine a working exponent. This is about 25% slower
+           than updating the exponent in a single loop, but easier, so
+           let's go with it for now. */
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+        {
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+            {
+                /* DUMMY_OP; */
+                have_zeros = 1;
+            }
+            else
+            {
+                xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+                sexp = FLINT_MAX(sexp, xexp);
+            }
+        }
+
+        if (sexp == WORD_MIN)
+            return nfloat_zero(res, ctx);
+
+        pad_bits = FLINT_BIT_COUNT(len + 1) + 1;
+        sexp += pad_bits;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+        {
+            xexp = NFLOAT_EXP(initial);
+            xsgnbit = NFLOAT_SGNBIT(initial) ^ subtract;
+            delta = sexp - xexp;
+
+            if (delta < FLINT_BITS)
+            {
+                t1 = NFLOAT_D(initial)[0];
+                s1 = t1 >> delta;
+                s0 = t1 << (FLINT_BITS - delta);
+                if (xsgnbit)
+                    sub_ddmmss(s1, s0, 0, 0, s1, s0);
+            }
+            else if (delta < 2 * FLINT_BITS)
+            {
+                t1 = NFLOAT_D(initial)[0];
+                s0 = t1 >> (delta - FLINT_BITS);
+                if (xsgnbit)
+                    sub_ddmmss(s1, s0, 0, 0, s1, s0);
+            }
+        }
+
+        if (have_zeros)
+        {
+            for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+            {
+                if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+                {
+                    DUMMY_OP;
+                    continue;
+                }
+
+                xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+                delta = sexp - xexp;
+
+                if (delta < FLINT_BITS)
+                {
+                    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+                    umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+                    if (xsgnbit)
+                        sub_ddmmss(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+                    else
+                        add_ssaaaa(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+                }
+                else if (delta < 2 * FLINT_BITS)
+                {
+                    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+                    umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+                    if (xsgnbit)
+                        sub_ddmmss(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+                    else
+                        add_ssaaaa(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+                }
+            }
+        }
+        else
+        {
+            for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+            {
+                xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+                delta = sexp - xexp;
+
+                if (delta < FLINT_BITS)
+                {
+                    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+                    umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+                    if (xsgnbit)
+                        sub_ddmmss(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+                    else
+                        add_ssaaaa(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+                }
+                else if (delta < 2 * FLINT_BITS)
+                {
+                    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+                    umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+                    if (xsgnbit)
+                        sub_ddmmss(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+                    else
+                        add_ssaaaa(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+                }
+            }
+        }
+
+        if (LIMB_MSB_IS_SET(s1))
+        {
+            sub_ddmmss(s1, s0, 0, 0, s1, s0);
+            xsgnbit = 1;
+        }
+        else
+        {
+            xsgnbit = 0;
+        }
+
+        NFLOAT_SGNBIT(res) = xsgnbit ^ subtract;
+
+        if (s1 != 0)
+        {
+            norm = flint_clz(s1);
+            if (norm)
+                NFLOAT_D(res)[0] = (s1 << norm) | (s0 >> (FLINT_BITS - norm));
+            else
+                NFLOAT_D(res)[0] = s1;
+            NFLOAT_EXP(res) = sexp - norm;
+        }
+        else if (s0 != 0)
+        {
+            norm = flint_clz(s0);
+            NFLOAT_D(res)[0] = s0 << norm;
+            NFLOAT_EXP(res) = sexp - FLINT_BITS - norm;
+        }
+        else
+        {
+            return nfloat_zero(res, ctx);
+        }
+
+        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+        return GR_SUCCESS;
+    }
+
+    if (!(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)))
+    {
+        slong i, xexp, delta, sexp, norm, pad_bits;
+        int xsgnbit;
+        mp_limb_t t[NFLOAT_MAX_LIMBS + 1];
+        mp_limb_t s[NFLOAT_MAX_LIMBS + 1];
+        nfloat_srcptr xi, yi;
+        mp_limb_t s0, s1;
+        slong n;
+
+        slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+        s0 = s1 = 0;
+        sexp = WORD_MIN;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+            sexp = NFLOAT_EXP(initial);
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+        {
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+            {
+                DUMMY_OP;
+                continue;
+            }
+
+            xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+            sexp = FLINT_MAX(sexp, xexp);
+        }
+
+        if (sexp == WORD_MIN)
+            return nfloat_zero(res, ctx);
+
+        flint_mpn_zero(s, nlimbs + 1);
+
+        pad_bits = FLINT_BIT_COUNT(len + 1) + 1;
+        sexp += pad_bits;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+        {
+            xexp = NFLOAT_EXP(initial);
+            xsgnbit = NFLOAT_SGNBIT(initial) ^ subtract;
+            delta = sexp - xexp;
+
+            if (delta < FLINT_BITS)
+            {
+                mpn_rshift(s + 1, NFLOAT_D(initial), nlimbs, delta);
+                s[0] = NFLOAT_D(initial)[0] << (FLINT_BITS - delta);
+                if (xsgnbit)
+                    mpn_neg(s, s, nlimbs + 1);
+            }
+            else
+            {
+                slong delta_limbs, delta_bits;
+
+                delta_limbs = delta / FLINT_BITS;
+                delta_bits = delta % FLINT_BITS;
+
+                if (delta_limbs < nlimbs + 1)
+                {
+                    if (delta_bits == 0)
+                        flint_mpn_copyi(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
+                    else
+                        mpn_rshift(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs, delta_bits);
+
+                    if (xsgnbit)
+                        mpn_neg(s, s, nlimbs + 1);
+                }
+            }
+        }
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+        {
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+            {
+                DUMMY_OP;
+                continue;
+            }
+
+            xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+            xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+            delta = sexp - xexp;
+
+            if (delta < FLINT_BITS)
+            {
+                t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs);
+                mpn_rshift(t, t, nlimbs + 1, delta);
+
+                if (xsgnbit)
+                    mpn_sub_n(s, s, t, nlimbs + 1);
+                else
+                    mpn_add_n(s, s, t, nlimbs + 1);
+            }
+            else
+            {
+                slong delta_limbs, delta_bits;
+
+                delta_limbs = delta / FLINT_BITS;
+                delta_bits = delta % FLINT_BITS;
+
+                /* alternative criterion: if (delta < (FLINT_BITS * nlimbs) + 2 * pad_bits) */
+                if (delta_limbs < nlimbs + 1)
+                {
+                    /* todo: squaring case */
+                    flint_mpn_mulhigh_n(t, NFLOAT_D(xi) + delta_limbs - 1, NFLOAT_D(yi) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
+
+                    if (delta_bits != 0)
+                        mpn_rshift(t, t, nlimbs + 1 - delta_limbs, delta_bits);
+
+                    if (xsgnbit)
+                        mpn_sub(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
+                    else
+                        mpn_add(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
+                }
+                else
+                {
+                    /* Skip term. */
+                }
+            }
+        }
+
+        if (LIMB_MSB_IS_SET(s[nlimbs]))
+        {
+            mpn_neg(s, s, nlimbs + 1);
+            xsgnbit = 1;
+        }
+        else
+        {
+            xsgnbit = 0;
+        }
+
+        NFLOAT_SGNBIT(res) = xsgnbit ^ subtract;
+
+        n = nlimbs + 1;
+        MPN_NORM(s, n);
+
+        xexp = sexp - (nlimbs + 1 - n) * FLINT_BITS;
+
+        if (n == nlimbs + 1)
+        {
+            norm = flint_clz(s[n - 1]);
+            if (norm)
+            {
+                mpn_lshift(NFLOAT_D(res), s + 1, nlimbs, norm);
+                NFLOAT_D(res)[0] |= (s[0] >> (FLINT_BITS - norm));
+            }
+            else
+                flint_mpn_copyi(NFLOAT_D(res), s + 1, nlimbs);
+            xexp -= norm;
+        }
+        else
+        {
+            if (n == 0)
+                return nfloat_zero(res, ctx);
+
+            norm = flint_clz(s[n - 1]);
+            if (norm)
+                mpn_lshift(NFLOAT_D(res) + nlimbs - n, s, n, norm);
+            else
+                flint_mpn_copyi(NFLOAT_D(res) + nlimbs - n, s, n);
+            flint_mpn_zero(NFLOAT_D(res), nlimbs - n);
+            xexp -= norm;
+        }
+
+        NFLOAT_EXP(res) = xexp;
+        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+        return GR_SUCCESS;
+    }
+
+    return GR_UNABLE;
+}
+
+int
+_nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+{
+    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, y, ctx->sizeof_elem, len, ctx);
+}
+
+int
+_nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+{
+    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx);
+}
diff --git a/src/nfloat/inlines.c b/src/nfloat/inlines.c
new file mode 100644
index 0000000000..10fdf3a493
--- /dev/null
+++ b/src/nfloat/inlines.c
@@ -0,0 +1,14 @@
+/*
+    Copyright (C) 2015 William Hart
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#define NFLOAT_INLINES_C
+
+#include "nfloat.h"
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
new file mode 100644
index 0000000000..a609e893bc
--- /dev/null
+++ b/src/nfloat/nfloat.c
@@ -0,0 +1,1052 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <limits.h>
+#include <mpfr.h>
+#include "fmpz.h"
+#include "arf.h"
+#include "nfloat.h"
+
+int
+nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    gr_ctx_t arf_ctx;
+    arf_t t;
+    int status;
+
+    gr_ctx_init_real_float_arf(arf_ctx, NFLOAT_CTX_PREC(ctx));
+    arf_init(t);
+    nfloat_get_arf(t, x, ctx);
+    status = gr_write(out, t, arf_ctx);
+    arf_clear(t);
+    return status;
+    gr_ctx_clear(arf_ctx);
+}
+
+int
+nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx)
+{
+    arf_t t;
+    int status;
+
+    arf_init(t);
+    arf_randtest(t, state, NFLOAT_CTX_PREC(ctx), 10);
+    status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+    return status;
+}
+
+void
+nfloat_swap(nfloat_ptr x, nfloat_ptr y, gr_ctx_t ctx)
+{
+    slong i, n = NFLOAT_CTX_DATA_NLIMBS(ctx);
+
+    for (i = 0; i < n; i++)
+        FLINT_SWAP(mp_limb_t, NFLOAT_DATA(x)[i], NFLOAT_DATA(y)[i]);
+}
+
+int
+nfloat_set(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    slong i, n = NFLOAT_CTX_DATA_NLIMBS(ctx);
+
+    for (i = 0; i < n; i++)
+        NFLOAT_DATA(res)[i] = NFLOAT_DATA(x)[i];
+
+    return GR_SUCCESS;
+}
+
+truth_t
+nfloat_equal(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    slong i, n = NFLOAT_CTX_NLIMBS(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        return (NFLOAT_EXP(x) == NFLOAT_EXP(y) &&
+                NFLOAT_SGNBIT(x) == NFLOAT_SGNBIT(y)) ? T_TRUE : T_FALSE;
+    }
+
+    if (NFLOAT_EXP(x) != NFLOAT_EXP(y))
+        return T_FALSE;
+
+    if (NFLOAT_SGNBIT(x) != NFLOAT_SGNBIT(y))
+        return T_FALSE;
+
+    for (i = 0; i < n; i++)
+        if (NFLOAT_D(x)[i] != NFLOAT_D(y)[i])
+            return T_FALSE;
+
+    return T_TRUE;
+}
+
+int
+nfloat_one(nfloat_ptr res, gr_ctx_t ctx)
+{
+    slong n = NFLOAT_CTX_NLIMBS(ctx);
+    NFLOAT_EXP(res) = 1;
+    NFLOAT_SGNBIT(res) = 0;
+    flint_mpn_zero(NFLOAT_D(res), n - 1);
+    NFLOAT_D(res)[n - 1] = UWORD(1) << (FLINT_BITS - 1);
+    return GR_SUCCESS;
+}
+
+int
+nfloat_neg_one(nfloat_ptr res, gr_ctx_t ctx)
+{
+    slong n = NFLOAT_CTX_NLIMBS(ctx);
+    NFLOAT_EXP(res) = 1;
+    NFLOAT_SGNBIT(res) = 1;
+    flint_mpn_zero(NFLOAT_D(res), n - 1);
+    NFLOAT_D(res)[n - 1] = UWORD(1) << (FLINT_BITS - 1);
+    return GR_SUCCESS;
+}
+
+truth_t
+nfloat_is_one(nfloat_srcptr x, gr_ctx_t ctx)
+{
+    slong n = NFLOAT_CTX_NLIMBS(ctx);
+
+    if (NFLOAT_IS_NAN(x))
+        return T_UNKNOWN;
+
+    return ((NFLOAT_EXP(x) == 1) && (NFLOAT_SGNBIT(x) == 0) &&
+           flint_mpn_zero_p(NFLOAT_D(x), n - 1) &&
+            (NFLOAT_D(x)[n - 1] == (UWORD(1) << (FLINT_BITS - 1)))) ? T_TRUE : T_FALSE;
+}
+
+truth_t
+nfloat_is_neg_one(nfloat_srcptr x, gr_ctx_t ctx)
+{
+    slong n = NFLOAT_CTX_NLIMBS(ctx);
+
+    if (NFLOAT_IS_NAN(x))
+        return T_UNKNOWN;
+
+    return (NFLOAT_EXP(x) == 1) && (NFLOAT_SGNBIT(x) == 1) &&
+           flint_mpn_zero_p(NFLOAT_D(x), n - 1) &&
+            (NFLOAT_D(x)[n - 1] == (UWORD(1) << (FLINT_BITS - 1))) ? T_TRUE : T_FALSE;
+}
+
+int
+nfloat_pos_inf(nfloat_ptr res, gr_ctx_t ctx)
+{
+    if (!(NFLOAT_CTX_FLAGS(ctx) & NFLOAT_ALLOW_INF))
+        return GR_DOMAIN;
+
+    NFLOAT_EXP(res) = NFLOAT_EXP_POS_INF;
+    NFLOAT_SGNBIT(res) = 0;
+    return GR_SUCCESS;
+}
+
+int
+nfloat_neg_inf(nfloat_ptr res, gr_ctx_t ctx)
+{
+    if (!(NFLOAT_CTX_FLAGS(ctx) & NFLOAT_ALLOW_INF))
+        return GR_DOMAIN;
+
+    NFLOAT_EXP(res) = NFLOAT_EXP_NEG_INF;
+    NFLOAT_SGNBIT(res) = 0;
+    return GR_SUCCESS;
+}
+
+int
+nfloat_nan(nfloat_ptr res, gr_ctx_t ctx)
+{
+    if (!(NFLOAT_CTX_FLAGS(ctx) & NFLOAT_ALLOW_NAN))
+        return GR_UNABLE;
+
+    NFLOAT_EXP(res) = NFLOAT_EXP_NAN;
+    NFLOAT_SGNBIT(res) = 0;
+    return GR_SUCCESS;
+}
+
+int
+_nfloat_underflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
+{
+    if (!(NFLOAT_CTX_FLAGS(ctx) & NFLOAT_ALLOW_UNDERFLOW))
+        return GR_UNABLE;
+
+    return nfloat_zero(res, ctx);
+}
+
+int
+_nfloat_overflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
+{
+    return sgnbit ? nfloat_neg_inf(res, ctx) : nfloat_pos_inf(res, ctx);
+}
+
+int
+nfloat_set_ui(nfloat_ptr res, ulong x, gr_ctx_t ctx)
+{
+    if (x == 0)
+    {
+        return nfloat_zero(res, ctx);
+    }
+    else
+    {
+        slong n = NFLOAT_CTX_NLIMBS(ctx);
+        slong norm = flint_clz(x);
+
+        NFLOAT_EXP(res) = FLINT_BITS - norm;
+        NFLOAT_SGNBIT(res) = 0;
+        flint_mpn_zero(NFLOAT_D(res), n - 1);
+        NFLOAT_D(res)[n - 1] = x << norm;
+        return GR_SUCCESS;
+    }
+}
+
+int
+nfloat_set_si(nfloat_ptr res, slong x, gr_ctx_t ctx)
+{
+    if (x == 0)
+    {
+        return nfloat_zero(res, ctx);
+    }
+    else
+    {
+        ulong ux = (x > 0) ? x : -x;
+        slong n = NFLOAT_CTX_NLIMBS(ctx);
+        slong norm = flint_clz(ux);
+
+        NFLOAT_EXP(res) = FLINT_BITS - norm;
+        NFLOAT_SGNBIT(res) = (x < 0);
+        flint_mpn_zero(NFLOAT_D(res), n - 1);
+        NFLOAT_D(res)[n - 1] = ux << norm;
+        return GR_SUCCESS;
+    }
+}
+
+int
+nfloat_set_fmpz(nfloat_ptr res, const fmpz_t x, gr_ctx_t ctx)
+{
+    if (!COEFF_IS_MPZ(*x))
+    {
+        return nfloat_set_si(res, *x, ctx);
+    }
+    else
+    {
+        mpz_ptr z = COEFF_TO_PTR(*x);
+        slong zn = z->_mp_size;
+
+        if (zn > 0)
+            return _nfloat_set_mpn_2exp(res, z->_mp_d, zn, zn * FLINT_BITS, 0, ctx);
+        else
+            return _nfloat_set_mpn_2exp(res, z->_mp_d, -zn, -zn * FLINT_BITS, 1, ctx);
+    }
+}
+
+int
+nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx)
+{
+    if (arf_is_special(x))
+    {
+        if (arf_is_zero(x))
+            return nfloat_zero(res, ctx);
+        else if (arf_is_pos_inf(x))
+            return nfloat_pos_inf(res, ctx);
+        else if (arf_is_neg_inf(x))
+            return nfloat_neg_inf(res, ctx);
+        else
+            return nfloat_nan(res, ctx);
+    }
+    else
+    {
+        slong exp, xn;
+        mp_srcptr xp;
+        int sgnbit;
+
+        ARF_GET_MPN_READONLY(xp, xn, x);
+
+        exp = ARF_EXP(x);
+        sgnbit = ARF_SGNBIT(x);
+
+        if (COEFF_IS_MPZ(exp) || exp < NFLOAT_MIN_EXP || exp > NFLOAT_MAX_EXP)
+        {
+            if (fmpz_sgn(ARF_EXPREF(x)) < 0)
+                return _nfloat_underflow(res, sgnbit, ctx);
+            else
+                return _nfloat_overflow(res, sgnbit, ctx);
+        }
+
+        _nfloat_set_mpn_2exp(res, xp, xn, exp, sgnbit, ctx);
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_get_arf(arf_t res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            arf_zero(res);
+        else if (NFLOAT_IS_POS_INF(x))
+            arf_pos_inf(res);
+        else if (NFLOAT_IS_NEG_INF(x))
+            arf_neg_inf(res);
+        else
+            arf_nan(res);
+    }
+    else
+    {
+        arf_set_mpn(res, NFLOAT_D(x), NFLOAT_CTX_NLIMBS(ctx), NFLOAT_SGNBIT(x));
+        arf_mul_2exp_si(res, res, NFLOAT_EXP(x) - FLINT_BITS * NFLOAT_CTX_NLIMBS(ctx));
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_neg(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    if (res != x)
+    {
+        slong i, n = NFLOAT_CTX_DATA_NLIMBS(ctx);
+
+        for (i = 0; i < n; i++)
+            NFLOAT_DATA(res)[i] = NFLOAT_DATA(x)[i];
+    }
+
+    if (NFLOAT_IS_SPECIAL(res))
+    {
+        if (NFLOAT_IS_POS_INF(res))
+            NFLOAT_EXP(res) = NFLOAT_EXP_NEG_INF;
+        else if (NFLOAT_IS_NEG_INF(res))
+            NFLOAT_EXP(res) = NFLOAT_EXP_POS_INF;
+    }
+    else
+    {
+        NFLOAT_SGNBIT(res) ^= 1;
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_abs(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    if (res != x)
+    {
+        slong i, n = NFLOAT_CTX_DATA_NLIMBS(ctx);
+
+        for (i = 0; i < n; i++)
+            NFLOAT_DATA(res)[i] = NFLOAT_DATA(x)[i];
+    }
+
+    if (NFLOAT_IS_SPECIAL(res))
+    {
+        if (NFLOAT_IS_NEG_INF(res))
+            NFLOAT_EXP(res) = NFLOAT_EXP_POS_INF;
+    }
+    else
+    {
+        NFLOAT_SGNBIT(res) = 0;
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+_nfloat_cmp(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    int xs, ys, mc;
+    slong xe, ye;
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_CTX_HAS_INF_NAN(ctx))
+        {
+            if (NFLOAT_IS_NAN(x) || NFLOAT_IS_NAN(y))
+                return 0;
+            if (NFLOAT_IS_POS_INF(x)) return NFLOAT_IS_POS_INF(y) ? 0 : 1;
+            if (NFLOAT_IS_NEG_INF(x)) return NFLOAT_IS_NEG_INF(y) ? 0 : -1;
+            if (NFLOAT_IS_POS_INF(y)) return -1;
+            if (NFLOAT_IS_NEG_INF(y)) return 1;
+        }
+
+        if (NFLOAT_IS_ZERO(x) && NFLOAT_IS_ZERO(y)) return 0;
+        if (NFLOAT_IS_ZERO(x)) return NFLOAT_SGNBIT(y) ? 1 : -1;
+        if (NFLOAT_IS_ZERO(y)) return NFLOAT_SGNBIT(x) ? -1 : 1;
+    }
+
+    xs = NFLOAT_SGNBIT(x);
+    ys = NFLOAT_SGNBIT(y);
+
+    if (xs != ys)
+        return xs ? -1 : 1;
+
+    xe = NFLOAT_EXP(x);
+    ye = NFLOAT_EXP(y);
+
+    if (xe != ye)
+        return ((xe < ye) ^ xs) ? -1 : 1;
+
+    mc = mpn_cmp(NFLOAT_D(x), NFLOAT_D(y), NFLOAT_CTX_NLIMBS(ctx));
+
+    if (mc != 0)
+        return ((mc < 0) ^ xs) ? -1 : 1;
+
+    return 0;
+}
+
+int
+_nfloat_cmpabs(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    int mc;
+    slong xe, ye;
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_CTX_HAS_INF_NAN(ctx))
+        {
+            if (NFLOAT_IS_NAN(x) || NFLOAT_IS_NAN(y))
+                return 0;
+            if (NFLOAT_IS_INF(x)) return NFLOAT_IS_INF(y) ? 0 : 1;
+            if (NFLOAT_IS_INF(y)) return -1;
+        }
+
+        if (NFLOAT_IS_ZERO(x) && NFLOAT_IS_ZERO(y)) return 0;
+        if (NFLOAT_IS_ZERO(x)) return -1;
+        if (NFLOAT_IS_ZERO(y)) return 1;
+    }
+
+    xe = NFLOAT_EXP(x);
+    ye = NFLOAT_EXP(y);
+
+    if (xe != ye)
+        return (xe < ye) ? -1 : 1;
+
+    mc = mpn_cmp(NFLOAT_D(x), NFLOAT_D(y), NFLOAT_CTX_NLIMBS(ctx));
+
+    if (mc != 0)
+        return (mc < 0) ? -1 : 1;
+
+    return 0;
+}
+
+int
+nfloat_cmp(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_NAN(x) || NFLOAT_IS_NAN(y))
+        return GR_UNABLE;
+
+    *res = _nfloat_cmp(x, y, ctx);
+    return GR_SUCCESS;
+}
+
+int
+nfloat_cmpabs(int * res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_NAN(x) || NFLOAT_IS_NAN(y))
+        return GR_UNABLE;
+
+    *res = _nfloat_cmpabs(x, y, ctx);
+    return GR_SUCCESS;
+}
+
+int
+nfloat_sgn(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            return nfloat_zero(res, ctx);
+
+        if (NFLOAT_IS_POS_INF(x))
+            return nfloat_one(res, ctx);
+
+        if (NFLOAT_IS_NEG_INF(x))
+            return nfloat_neg_one(res, ctx);
+
+        return nfloat_nan(res, ctx);
+    }
+    else
+    {
+        return NFLOAT_SGNBIT(x) ? nfloat_neg_one(res, ctx) : nfloat_one(res, ctx);
+    }
+}
+
+int
+nfloat_im(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    return nfloat_zero(res, ctx);
+}
+
+static mp_limb_pair_t
+_flint_mpn_mulhigh_normalised2(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+    mp_limb_pair_t ret;
+
+    FLINT_ASSERT(n >= 1);
+
+    if (rp == xp || rp == yp)
+    {
+        mp_ptr t;
+        TMP_INIT;
+        TMP_START;
+        t = TMP_ALLOC(sizeof(mp_limb_t) * n);
+        ret = _flint_mpn_mulhigh_normalised2(t, xp, yp, n);
+        flint_mpn_copyi(rp, t, n);
+        TMP_END;
+        return ret;
+    }
+
+    if (xp == yp)
+        ret.m1 = flint_mpn_sqrhigh(rp, xp, n);
+    else
+        ret.m1 = flint_mpn_mulhigh_n(rp, xp, yp, n);
+
+    if (LIMB_MSB_IS_SET(rp[n - 1]))
+    {
+        ret.m2 = 0;
+    }
+    else
+    {
+        ret.m2 = 1;
+        mpn_lshift(rp, rp, n, 1);
+        rp[0] |= (ret.m1 >> (FLINT_BITS - 1));
+        ret.m1 <<= 1;
+    }
+
+    return ret;
+}
+
+/* handles aliasing */
+FLINT_FORCE_INLINE
+mp_limb_pair_t flint_mpn_mulhigh_normalised2(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+    FLINT_ASSERT(n >= 1);
+
+    if (FLINT_HAVE_MULHIGH_NORMALISED_FUNC(n))
+        return flint_mpn_mulhigh_normalised_func_tab[n](rp, xp, yp);
+    else
+        return _flint_mpn_mulhigh_normalised2(rp, xp, yp, n);
+}
+
+FLINT_FORCE_INLINE int
+n_signed_sub(ulong * r, ulong x, ulong y)
+{
+    if (x >= y)
+    {
+        *r = x - y;
+        return 0;
+    }
+    else
+    {
+        *r = y - x;
+        return 1;
+    }
+}
+
+int
+_nfloat_add_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx)
+{
+    mp_limb_t hi, lo;
+
+    NFLOAT_SGNBIT(res) = xsgnbit;
+
+    if (delta < FLINT_BITS)
+    {
+        add_ssaaaa(hi, lo, 0, x0, 0, y0 >> delta);
+
+        if (hi == 0)
+        {
+            NFLOAT_D(res)[0] = lo;
+            NFLOAT_EXP(res) = xexp;
+        }
+        else
+        {
+            NFLOAT_D(res)[0] = (lo >> 1) | (UWORD(1) << (FLINT_BITS - 1));
+            NFLOAT_EXP(res) = xexp + 1;
+            NFLOAT_HANDLE_OVERFLOW(res, ctx);
+        }
+    }
+    else
+    {
+        NFLOAT_D(res)[0] = x0;
+        NFLOAT_EXP(res) = xexp;
+        NFLOAT_SGNBIT(res) = xsgnbit;
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+_nfloat_sub_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx)
+{
+    mp_limb_t u;
+    slong norm;
+
+    if (delta == 0)
+    {
+        NFLOAT_SGNBIT(res) = n_signed_sub(&u, x0, y0) ^ xsgnbit;
+
+        if (u == 0)
+            return nfloat_zero(res, ctx);
+    }
+    else if (delta < FLINT_BITS)
+    {
+        u = x0 - (y0 >> delta);
+        NFLOAT_SGNBIT(res) = xsgnbit;
+    }
+    else
+    {
+        NFLOAT_D(res)[0] = x0;
+        NFLOAT_EXP(res) = xexp;
+        NFLOAT_SGNBIT(res) = xsgnbit;
+        return GR_SUCCESS;
+    }
+
+    norm = flint_clz(u);
+    NFLOAT_D(res)[0] = u << norm;
+    NFLOAT_EXP(res) = xexp - norm;
+    NFLOAT_HANDLE_UNDERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
+int
+_nfloat_add_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx)
+{
+    slong shift_limbs, shift_bits;
+    mp_limb_t cy;
+    mp_limb_t t[NFLOAT_MAX_LIMBS];
+
+    NFLOAT_SGNBIT(res) = xsgnbit;
+
+    if (delta == 0)
+    {
+        cy = mpn_add_n(NFLOAT_D(res), xd, yd, nlimbs);
+    }
+    else if (delta < FLINT_BITS)
+    {
+        mpn_rshift(t, yd, nlimbs, delta);
+        cy = mpn_add_n(NFLOAT_D(res), xd, t, nlimbs);
+    }
+    else if (delta < nlimbs * FLINT_BITS)
+    {
+        shift_limbs = delta / FLINT_BITS;
+        shift_bits = delta % FLINT_BITS;
+        if (shift_bits == 0)
+            flint_mpn_copyi(t, yd + shift_limbs, nlimbs - shift_limbs);
+        else
+            mpn_rshift(t, yd + shift_limbs, nlimbs - shift_limbs, shift_bits);
+        cy = mpn_add(NFLOAT_D(res), xd, nlimbs, t, nlimbs - shift_limbs);
+    }
+    else
+    {
+        flint_mpn_copyi(NFLOAT_D(res), xd, nlimbs);
+        cy = 0;
+    }
+
+    if (cy == 0)
+    {
+        NFLOAT_EXP(res) = xexp;
+    }
+    else
+    {
+        mpn_rshift(NFLOAT_D(res), NFLOAT_D(res), nlimbs, 1);
+        NFLOAT_D(res)[nlimbs - 1] |= (UWORD(1) << (FLINT_BITS - 1));
+        NFLOAT_EXP(res) = xexp + 1;
+        NFLOAT_HANDLE_OVERFLOW(res, ctx);
+    }
+
+    return GR_SUCCESS;
+}
+
+int
+_nfloat_sub_n(nfloat_ptr res, mp_srcptr xd, slong xexp, int xsgnbit, mp_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx)
+{
+    slong shift_limbs, shift_bits, n, norm;
+    mp_limb_t t[NFLOAT_MAX_LIMBS];
+
+    if (delta == 0)
+    {
+        NFLOAT_SGNBIT(res) = flint_mpn_signed_sub_n(NFLOAT_D(res), xd, yd, nlimbs) ^ xsgnbit;
+    }
+    else
+    {
+        NFLOAT_SGNBIT(res) = xsgnbit;
+
+        if (delta < FLINT_BITS)
+        {
+            mpn_rshift(t, yd, nlimbs, delta);
+            mpn_sub_n(NFLOAT_D(res), xd, t, nlimbs);
+            NFLOAT_SGNBIT(res) = xsgnbit;
+        }
+        else if (delta < nlimbs * FLINT_BITS)
+        {
+            shift_limbs = delta / FLINT_BITS;
+            shift_bits = delta % FLINT_BITS;
+            if (shift_bits == 0)
+                flint_mpn_copyi(t, yd + shift_limbs, nlimbs - shift_limbs);
+            else
+                mpn_rshift(t, yd + shift_limbs, nlimbs - shift_limbs, shift_bits);
+            mpn_sub(NFLOAT_D(res), xd, nlimbs, t, nlimbs - shift_limbs);
+            NFLOAT_SGNBIT(res) = xsgnbit;
+        }
+        else
+        {
+            flint_mpn_copyi(NFLOAT_D(res), xd, nlimbs);
+            NFLOAT_EXP(res) = xexp;
+            NFLOAT_SGNBIT(res) = xsgnbit;
+            return GR_SUCCESS;
+        }
+    }
+
+    n = nlimbs;
+    MPN_NORM(NFLOAT_D(res), n);
+
+    if (n != nlimbs)
+    {
+        if (n == 0)
+            return nfloat_zero(res, ctx);
+
+        norm = flint_clz(NFLOAT_D(res)[n - 1]);
+        if (norm)
+            mpn_lshift(NFLOAT_D(res) + nlimbs - n, NFLOAT_D(res), n, norm);
+        else
+            flint_mpn_copyd(NFLOAT_D(res) + nlimbs - n, NFLOAT_D(res), n);
+
+        xexp -= (nlimbs - n) * FLINT_BITS + norm;
+    }
+    else
+    {
+        norm = flint_clz(NFLOAT_D(res)[nlimbs - 1]);
+        if (norm)
+            mpn_lshift(NFLOAT_D(res), NFLOAT_D(res), nlimbs, norm);
+        xexp -= norm;
+    }
+
+    NFLOAT_EXP(res) = xexp;
+    NFLOAT_HANDLE_UNDERFLOW(res, ctx);
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    slong xexp, yexp, delta, nlimbs;
+    int xsgnbit, ysgnbit;
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            return nfloat_set(res, y, ctx);
+        if (NFLOAT_IS_ZERO(y))
+            return nfloat_set(res, x, ctx);
+        if (NFLOAT_IS_POS_INF(x) && NFLOAT_IS_POS_INF(y))
+            return nfloat_pos_inf(res, ctx);
+        if (NFLOAT_IS_NEG_INF(x) && NFLOAT_IS_NEG_INF(y))
+            return nfloat_neg_inf(res, ctx);
+        return nfloat_nan(res, ctx);
+    }
+
+    nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+    xexp = NFLOAT_EXP(x);
+    yexp = NFLOAT_EXP(y);
+
+    if (xexp < yexp)
+    {
+        FLINT_SWAP(nfloat_srcptr, x, y);
+        FLINT_SWAP(slong, xexp, yexp);
+    }
+
+    xsgnbit = NFLOAT_SGNBIT(x);
+    ysgnbit = NFLOAT_SGNBIT(y);
+
+    delta = xexp - yexp;
+
+    if (xsgnbit == ysgnbit)
+    {
+        if (nlimbs == 1)
+            return _nfloat_add_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx);
+        else
+            return _nfloat_add_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx);
+    }
+    else
+    {
+        if (nlimbs == 1)
+            return _nfloat_sub_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx);
+        else
+            return _nfloat_sub_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx);
+    }
+}
+
+int
+nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    slong xexp, yexp, delta, nlimbs;
+    int xsgnbit, ysgnbit;
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            return nfloat_neg(res, y, ctx);
+        if (NFLOAT_IS_ZERO(y))
+            return nfloat_set(res, x, ctx);
+        if (NFLOAT_IS_POS_INF(x) && NFLOAT_IS_NEG_INF(y))
+            return nfloat_pos_inf(res, ctx);
+        if (NFLOAT_IS_NEG_INF(x) && NFLOAT_IS_POS_INF(y))
+            return nfloat_neg_inf(res, ctx);
+        return nfloat_nan(res, ctx);
+    }
+
+    nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+    xexp = NFLOAT_EXP(x);
+    yexp = NFLOAT_EXP(y);
+
+    xsgnbit = NFLOAT_SGNBIT(x);
+    ysgnbit = NFLOAT_SGNBIT(y) ^ 1;
+
+    if (xexp < yexp)
+    {
+        FLINT_SWAP(nfloat_srcptr, x, y);
+        FLINT_SWAP(slong, xexp, yexp);
+        FLINT_SWAP(int, xsgnbit, ysgnbit);
+    }
+
+    delta = xexp - yexp;
+
+    if (xsgnbit == ysgnbit)
+    {
+        if (nlimbs == 1)
+            return _nfloat_add_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx);
+        else
+            return _nfloat_add_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx);
+    }
+    else
+    {
+        if (nlimbs == 1)
+            return _nfloat_sub_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx);
+        else
+            return _nfloat_sub_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx);
+    }
+}
+
+int
+nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    mp_limb_pair_t mul_res;
+    slong nlimbs;
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN))
+        {
+            if (NFLOAT_IS_ZERO(x) && !NFLOAT_IS_SPECIAL(y))
+                return nfloat_zero(res, ctx);
+            if (NFLOAT_IS_ZERO(y) && !NFLOAT_IS_SPECIAL(x))
+                return nfloat_zero(res, ctx);
+            return GR_UNABLE;
+        }
+        else
+        {
+            return nfloat_zero(res, ctx);
+        }
+    }
+
+    nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+    if (nlimbs == 1)
+    {
+        mp_limb_t hi, lo;
+
+        umul_ppmm(hi, lo, NFLOAT_D(x)[0], NFLOAT_D(y)[0]);
+
+        if (LIMB_MSB_IS_SET(hi))
+        {
+            NFLOAT_D(res)[0] = hi;
+            NFLOAT_EXP(res) = NFLOAT_EXP(x) + NFLOAT_EXP(y);
+        }
+        else
+        {
+            NFLOAT_D(res)[0] = (hi << 1) | (lo >> (FLINT_BITS - 1));
+            NFLOAT_EXP(res) = NFLOAT_EXP(x) + NFLOAT_EXP(y) - 1;
+        }
+    }
+    else if (nlimbs == 2)
+    {
+        mp_limb_t r3, r2, r1, FLINT_SET_BUT_UNUSED(r0);
+
+        FLINT_MPN_MUL_2X2(r3, r2, r1, r0, NFLOAT_D(x)[1], NFLOAT_D(x)[0], NFLOAT_D(y)[1], NFLOAT_D(y)[0]);
+
+        if (LIMB_MSB_IS_SET(r3))
+        {
+            NFLOAT_D(res)[0] = r2;
+            NFLOAT_D(res)[1] = r3;
+            NFLOAT_EXP(res) = NFLOAT_EXP(x) + NFLOAT_EXP(y);
+        }
+        else
+        {
+            NFLOAT_D(res)[0] = (r2 << 1) | (r1 >> (FLINT_BITS - 1));
+            NFLOAT_D(res)[1] = (r3 << 1) | (r2 >> (FLINT_BITS - 1));
+            NFLOAT_EXP(res) = NFLOAT_EXP(x) + NFLOAT_EXP(y) - 1;
+        }
+    }
+    else
+    {
+        mul_res = flint_mpn_mulhigh_normalised2(NFLOAT_D(res), NFLOAT_D(x), NFLOAT_D(y), NFLOAT_CTX_NLIMBS(ctx));
+        NFLOAT_EXP(res) = NFLOAT_EXP(x) + NFLOAT_EXP(y) - mul_res.m2;
+    }
+
+    NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x) ^ NFLOAT_SGNBIT(y);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
+int
+nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    mpfr_t rf, xf, yf;
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x) || NFLOAT_IS_SPECIAL(y))
+    {
+        if (NFLOAT_IS_ZERO(x) && !NFLOAT_IS_SPECIAL(y))
+            return nfloat_zero(res, ctx);
+        else
+            return nfloat_nan(res, ctx); /* todo */
+    }
+
+    rf->_mpfr_d = NFLOAT_D(res);
+    rf->_mpfr_prec = prec;
+    rf->_mpfr_sign = 1;
+    rf->_mpfr_exp = 0;
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = 0;
+
+    yf->_mpfr_d = NFLOAT_D(y);
+    yf->_mpfr_prec = prec;
+    yf->_mpfr_sign = 1;
+    yf->_mpfr_exp = 0;
+
+    mpfr_div(rf, xf, yf, MPFR_RNDD);
+
+    NFLOAT_EXP(res) = NFLOAT_EXP(x) - NFLOAT_EXP(y) + rf->_mpfr_exp;
+    NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x) ^ NFLOAT_SGNBIT(y);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
+/* check if mpfr_div_ui argument is ulong */
+#if ULONG_MAX == UWORD_MAX
+
+int
+nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
+{
+    mpfr_t rf, xf;
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x) || y == 0)
+    {
+        if (NFLOAT_IS_ZERO(x) && y != 0)
+            return nfloat_zero(res, ctx);
+        else
+            return nfloat_nan(res, ctx); /* todo */
+    }
+
+    rf->_mpfr_d = NFLOAT_D(res);
+    rf->_mpfr_prec = prec;
+    rf->_mpfr_sign = 1;
+    rf->_mpfr_exp = 0;
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = 0;
+
+    mpfr_div_ui(rf, xf, y, MPFR_RNDD);
+
+    NFLOAT_EXP(res) = NFLOAT_EXP(x) + rf->_mpfr_exp;
+    NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
+int
+nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
+{
+    mpfr_t rf, xf;
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x) || y == 0)
+    {
+        if (NFLOAT_IS_ZERO(x) && y != 0)
+            return nfloat_zero(res, ctx);
+        else
+            return nfloat_nan(res, ctx); /* todo */
+    }
+
+    rf->_mpfr_d = NFLOAT_D(res);
+    rf->_mpfr_prec = prec;
+    rf->_mpfr_sign = 1;
+    rf->_mpfr_exp = 0;
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = 0;
+
+    mpfr_div_si(rf, xf, y, MPFR_RNDD);
+
+    NFLOAT_EXP(res) = NFLOAT_EXP(x) + rf->_mpfr_exp;
+    NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x) ^ (y < 0);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
+#else
+
+int
+nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
+{
+    arf_t t;
+    int status;
+
+    arf_init(t);
+    nfloat_get_arf(t, x, ctx);
+    arf_div_ui(t, t, y, NFLOAT_CTX_PREC(ctx), NFLOAT_CTX_RND(ctx));
+    status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return status;
+}
+
+int
+nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
+{
+    arf_t t;
+    int status;
+
+    arf_init(t);
+    nfloat_get_arf(t, x, ctx);
+    arf_div_si(t, t, y, NFLOAT_CTX_PREC(ctx), NFLOAT_CTX_RND(ctx));
+    status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return status;
+}
+
+#endif
diff --git a/src/nfloat/test/main.c b/src/nfloat/test/main.c
new file mode 100644
index 0000000000..2bce47d042
--- /dev/null
+++ b/src/nfloat/test/main.c
@@ -0,0 +1,28 @@
+/*
+    Copyright (C) 2023 Albin Ahlbäck
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <string.h>
+#include <stdlib.h>
+
+/* Include functions *********************************************************/
+
+#include "t-nfloat.c"
+
+/* Array of test functions ***************************************************/
+
+test_struct tests[] =
+{
+    TEST_FUNCTION(nfloat),
+};
+
+/* main function *************************************************************/
+
+TEST_MAIN(tests)
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
new file mode 100644
index 0000000000..b9d629e101
--- /dev/null
+++ b/src/nfloat/test/t-nfloat.c
@@ -0,0 +1,166 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "gr_vec.h"
+#include "arf.h"
+#include "nfloat.h"
+
+void
+nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
+{
+    slong iter, i, len, prec;
+    gr_ptr s0, vec1, vec2, res;
+    int subtract, initial, reverse;
+    arf_ptr avec1, avec2;
+    arf_t as0, ares, ares2, amag, err, t;
+
+    prec = NFLOAT_CTX_PREC(ctx);
+
+    for (iter = 0; iter < iters; iter++)
+    {
+        len = n_randint(state, 30);
+
+        initial = n_randint(state, 2);
+        subtract = n_randint(state, 2);
+        reverse = n_randint(state, 2);
+
+        vec1 = gr_heap_init_vec(len, ctx);
+        vec2 = gr_heap_init_vec(len, ctx);
+        s0 = gr_heap_init(ctx);
+        res = gr_heap_init(ctx);
+
+        avec1 = _arf_vec_init(len);
+        avec2 = _arf_vec_init(len);
+        arf_init(as0);
+        arf_init(ares);
+        arf_init(ares2);
+        arf_init(amag);
+        arf_init(t);
+        arf_init(err);
+
+        if (initial)
+        {
+            arf_randtest(as0, state, prec, 10);
+            GR_MUST_SUCCEED(nfloat_set_arf(s0, as0, ctx));
+
+            arf_set(ares, as0);
+            arf_abs(t, as0);
+            arf_add(amag, amag, t, prec, ARF_RND_DOWN);
+        }
+
+        for (i = 0; i < len; i++)
+        {
+            if (i > 0 && n_randint(state, 2))
+            {
+                arf_set(avec1 + i, avec1 + len - 1 - i);
+                arf_neg(avec2 + i, avec2 + len - 1 - i);
+            }
+            else
+            {
+                arf_randtest(avec1 + i, state, prec, 10);
+                arf_randtest(avec2 + i, state, prec, 10);
+            }
+        }
+
+        for (i = 0; i < len; i++)
+        {
+            arf_mul(t, avec1 + i, avec2 + (reverse ? len - 1 - i : i), 2 * prec, ARF_RND_DOWN);
+
+            if (subtract)
+                arf_sub(ares, ares, t, 2 * prec, ARF_RND_DOWN);
+            else
+                arf_add(ares, ares, t, 2 * prec, ARF_RND_DOWN);
+
+            arf_abs(t, t);
+            arf_add(amag, amag, t, prec, ARF_RND_DOWN);
+        }
+
+        /* tolerance */
+        arf_mul_2exp_si(t, amag, -prec + 3);
+
+        for (i = 0; i < len; i++)
+        {
+            GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec1, i, ctx->sizeof_elem), avec1 + i, ctx));
+            GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec2, i, ctx->sizeof_elem), avec2 + i, ctx));
+        }
+
+        if (reverse)
+            GR_MUST_SUCCEED(_nfloat_vec_dot_rev(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx));
+        else
+            GR_MUST_SUCCEED(_nfloat_vec_dot(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx));
+
+        GR_MUST_SUCCEED(nfloat_get_arf(ares2, res, ctx));
+
+        arf_sub(err, ares, ares2, prec, ARF_RND_DOWN);
+        arf_abs(err, err);
+
+        if (arf_cmpabs(err, t) > 0)
+        {
+            flint_printf("FAIL: dot\n");
+            gr_ctx_println(ctx);
+
+            flint_printf("reverse = %d, subtract = %d\n", reverse, subtract);
+
+            if (initial)
+            {
+                flint_printf("\n\ninitial = ");
+                arf_printd(as0, 2 + prec / 3.33);
+            }
+
+            flint_printf("\n\nvec1 = ");
+            _gr_vec_print(vec1, len, ctx);
+            flint_printf("\n\nvec2 = ");
+            _gr_vec_print(vec2, len, ctx);
+            flint_printf("\n\nares = \n");
+            arf_printd(ares, 2 + prec / 3.33);
+            flint_printf("\n\nares2 = \n");
+            arf_printd(ares2, 2 + prec / 3.33);
+            flint_printf("\n\ntol = \n");
+            arf_printd(t, 10);
+            flint_printf("\n\nerr = \n");
+            arf_printd(err, 10);
+            flint_printf("\n\n");
+
+            flint_abort();
+        }
+
+        gr_heap_clear_vec(vec1, len, ctx);
+        gr_heap_clear_vec(vec2, len, ctx);
+        gr_heap_clear(s0, ctx);
+        gr_heap_clear(res, ctx);
+
+        _arf_vec_clear(avec1, len);
+        _arf_vec_clear(avec2, len);
+        arf_clear(as0);
+        arf_clear(ares);
+        arf_clear(ares2);
+        arf_clear(amag);
+        arf_clear(t);
+        arf_clear(err);
+    }
+}
+
+TEST_FUNCTION_START(nfloat, state)
+{
+    gr_ctx_t ctx;
+    slong prec;
+
+    for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS)
+    {
+        nfloat_ctx_init(ctx, prec, 0);
+        gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0);
+        nfloat_test_dot(state, (prec <= 128 ? 10000 : 100) * flint_test_multiplier(), ctx);
+        gr_ctx_clear(ctx);
+    }
+
+    TEST_FUNCTION_END(state);
+}

From 47d37c2ae5ee1ab156ee0a9894b886cc56f3fbb8 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 6 May 2024 15:48:20 +0200
Subject: [PATCH 02/12] small fixes

---
 src/nfloat/nfloat.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index a609e893bc..2c2b3ef6e7 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -937,7 +937,7 @@ nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
     yf->_mpfr_sign = 1;
     yf->_mpfr_exp = 0;
 
-    mpfr_div(rf, xf, yf, MPFR_RNDD);
+    mpfr_div(rf, xf, yf, MPFR_RNDZ);
 
     NFLOAT_EXP(res) = NFLOAT_EXP(x) - NFLOAT_EXP(y) + rf->_mpfr_exp;
     NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x) ^ NFLOAT_SGNBIT(y);
@@ -973,7 +973,7 @@ nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
     xf->_mpfr_sign = 1;
     xf->_mpfr_exp = 0;
 
-    mpfr_div_ui(rf, xf, y, MPFR_RNDD);
+    mpfr_div_ui(rf, xf, y, MPFR_RNDZ);
 
     NFLOAT_EXP(res) = NFLOAT_EXP(x) + rf->_mpfr_exp;
     NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x);
@@ -1006,7 +1006,7 @@ nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
     xf->_mpfr_sign = 1;
     xf->_mpfr_exp = 0;
 
-    mpfr_div_si(rf, xf, y, MPFR_RNDD);
+    mpfr_div_si(rf, xf, y, MPFR_RNDZ);
 
     NFLOAT_EXP(res) = NFLOAT_EXP(x) + rf->_mpfr_exp;
     NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x) ^ (y < 0);
@@ -1025,7 +1025,7 @@ nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
 
     arf_init(t);
     nfloat_get_arf(t, x, ctx);
-    arf_div_ui(t, t, y, NFLOAT_CTX_PREC(ctx), NFLOAT_CTX_RND(ctx));
+    arf_div_ui(t, t, y, NFLOAT_CTX_PREC(ctx), ARF_RND_DOWN);
     status = nfloat_set_arf(res, t, ctx);
     arf_clear(t);
 
@@ -1041,7 +1041,7 @@ nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
 
     arf_init(t);
     nfloat_get_arf(t, x, ctx);
-    arf_div_si(t, t, y, NFLOAT_CTX_PREC(ctx), NFLOAT_CTX_RND(ctx));
+    arf_div_si(t, t, y, NFLOAT_CTX_PREC(ctx), ARF_RND_DOWN);
     status = nfloat_set_arf(res, t, ctx);
     arf_clear(t);
 

From d5bcdd1116fb562c0f688db3fdf245c2f97df0cc Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 6 May 2024 15:55:24 +0200
Subject: [PATCH 03/12] hack to allow mul aliasing test to pass for squaring

---
 src/gr/test_ring.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c
index 16036c9d16..430d3fd7ef 100644
--- a/src/gr/test_ring.c
+++ b/src/gr/test_ring.c
@@ -37,6 +37,14 @@ gr_test_binary_op_aliasing(gr_ctx_t R, int (*gr_op)(gr_ptr, gr_srcptr, gr_srcptr
     status |= gr_op(xy1, x, y, R);
 
     alias = n_randint(state, 4);
+
+    /* Don' test x * x == x^2 for inexact "rings" (e.g. floats) where
+       the squaring algorithm might not produce exactly the same result. */
+    if (alias == 2 && gr_ctx_is_ring(R) == T_FALSE && gr_ctx_is_exact(R) == T_FALSE)
+    {
+        alias = 3;
+    }
+
     switch (alias)
     {
         case 0:

From ec34de275c47c58013a4b5fadd04c709b6e0502e Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 6 May 2024 19:57:57 +0200
Subject: [PATCH 04/12] two-limb dot

---
 src/gr/test_ring.c         |   6 +-
 src/nfloat/dot.c           | 202 ++++++++++++++++++++++++++++++++++++-
 src/nfloat/test/t-nfloat.c |  13 ++-
 3 files changed, 213 insertions(+), 8 deletions(-)

diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c
index 430d3fd7ef..06628c81b5 100644
--- a/src/gr/test_ring.c
+++ b/src/gr/test_ring.c
@@ -38,11 +38,11 @@ gr_test_binary_op_aliasing(gr_ctx_t R, int (*gr_op)(gr_ptr, gr_srcptr, gr_srcptr
 
     alias = n_randint(state, 4);
 
-    /* Don' test x * x == x^2 for inexact "rings" (e.g. floats) where
+    /* Don't test x * x == x^2 for inexact "rings" (e.g. floats) where
        the squaring algorithm might not produce exactly the same result. */
-    if (alias == 2 && gr_ctx_is_ring(R) == T_FALSE && gr_ctx_is_exact(R) == T_FALSE)
+    if ((alias == 2 || alias == 3) && gr_ctx_is_ring(R) == T_FALSE && gr_ctx_is_exact(R) == T_FALSE)
     {
-        alias = 3;
+        alias -= 2;
     }
 
     switch (alias)
diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c
index 546e032cf5..b2596378be 100644
--- a/src/nfloat/dot.c
+++ b/src/nfloat/dot.c
@@ -22,7 +22,7 @@
 #define FLINT_MPN_MUL_2X2H(r3, r2, r1, a1, a0, b1, b0)   \
     do {                                                                  \
         mp_limb_t __t1, __t2, __u1, __u2, __v3, __v2;                     \
-        mp_limb_t __r3, __r2, __r1, __r0;                                 \
+        mp_limb_t __r3, __r2, __r1;                                       \
         mp_limb_t __a1 = (a1), __a0 = (a0), __b1 = (b1), __b0 = (b0);     \
         umul_ppmm(__t2, __t1, __a0, __b1);                                \
         umul_ppmm(__u2, __u1, __a1, __b0);                                \
@@ -107,6 +107,7 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
 
                 xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
                 delta = sexp - xexp;
+                FLINT_ASSERT(delta != 0);
 
                 if (delta < FLINT_BITS)
                 {
@@ -192,6 +193,203 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         return GR_SUCCESS;
     }
 
+    if (NFLOAT_CTX_NLIMBS(ctx) == 2 && !(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)))
+    {
+        slong i, xexp, delta, sexp, norm, pad_bits;
+        int xsgnbit;
+        nfloat_srcptr xi, yi;
+        mp_limb_t s0, s1, s2;
+        mp_limb_t t0, t1, t2, t3;
+
+        s0 = s1 = s2 = 0;
+        sexp = WORD_MIN;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+            sexp = NFLOAT_EXP(initial);
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+        {
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+            {
+                DUMMY_OP;
+                continue;
+            }
+
+            xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+            sexp = FLINT_MAX(sexp, xexp);
+        }
+
+        if (sexp == WORD_MIN)
+            return nfloat_zero(res, ctx);
+
+        pad_bits = FLINT_BIT_COUNT(len + 1) + 1;
+        sexp += pad_bits;
+
+        if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+        {
+            xexp = NFLOAT_EXP(initial);
+            xsgnbit = NFLOAT_SGNBIT(initial) ^ subtract;
+            delta = sexp - xexp;
+            FLINT_ASSERT(delta != 0);
+
+            if (delta < 3 * FLINT_BITS)
+            {
+                if (delta < FLINT_BITS)
+                {
+                    s0 = 0;
+                    s1 = NFLOAT_D(initial)[0];
+                    s2 = NFLOAT_D(initial)[1];
+                }
+                else if (delta < 2 * FLINT_BITS)
+                {
+                    s0 = NFLOAT_D(initial)[0];
+                    s1 = NFLOAT_D(initial)[1];
+                    s2 = 0;
+
+                    delta -= FLINT_BITS;
+                }
+                else
+                {
+                    s0 = NFLOAT_D(initial)[1];
+                    s1 = 0;
+                    s2 = 0;
+
+                    delta -= 2 * FLINT_BITS;
+                }
+
+                if (delta != 0)
+                {
+                    s0 = (s0 >> delta) | s1 << (FLINT_BITS - delta);
+                    s1 = (s1 >> delta) | (s2 << (FLINT_BITS - delta));
+                    s2 = s2 >> delta;
+                }
+
+                if (xsgnbit)
+                    sub_dddmmmsss(s2, s1, s0, 0, 0, 0, s2, s1, s0);
+            }
+        }
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
+        {
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+            {
+                DUMMY_OP;
+                continue;
+            }
+
+            xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+            xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+            delta = sexp - xexp;
+            FLINT_ASSERT(delta != 0);
+
+            if (delta < FLINT_BITS)
+            {
+                xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
+
+#if 1
+                FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+                (void) t0;
+#else
+                FLINT_MPN_MUL_2X2(t3, t2, t1, t0, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+#endif
+
+                if (xsgnbit)
+                    sub_dddmmmsss(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta)));
+                else
+                    add_sssaaaaaa(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta)));
+            }
+            else if (delta < 3 * FLINT_BITS)
+            {
+                if (delta < 2 * FLINT_BITS)
+                {
+                    FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+
+                    delta -= FLINT_BITS;
+
+                    if (delta != 0)
+                    {
+                        t2 = (t2 >> delta) | (t3 << (FLINT_BITS - delta));
+                        t3 = t3 >> delta;
+                    }
+                }
+                else
+                {
+                    umul_ppmm(t3, t2, NFLOAT_D(xi)[1], NFLOAT_D(yi)[1]);
+
+                    delta -= 2 * FLINT_BITS;
+
+                    if (delta != 0)
+                        t3 = t3 >> delta;
+
+                    t2 = t3;
+                    t3 = 0;
+                }
+
+                if (xsgnbit)
+                    sub_dddmmmsss(s2, s1, s0, s2, s1, s0, 0, t3, t2);
+                else
+                    add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t3, t2);
+            }
+        }
+
+        if (LIMB_MSB_IS_SET(s2))
+        {
+            sub_dddmmmsss(s2, s1, s0, 0, 0, 0, s2, s1, s0);
+            xsgnbit = 1;
+        }
+        else
+        {
+            xsgnbit = 0;
+        }
+
+        NFLOAT_SGNBIT(res) = xsgnbit ^ subtract;
+
+        if (s2 != 0)
+        {
+            norm = flint_clz(s2);
+            if (norm)
+            {
+                NFLOAT_D(res)[0] = (s1 << norm) | (s0 >> (FLINT_BITS - norm));
+                NFLOAT_D(res)[1] = (s2 << norm) | (s1 >> (FLINT_BITS - norm));
+            }
+            else
+            {
+                NFLOAT_D(res)[0] = s1;
+                NFLOAT_D(res)[1] = s2;
+            }
+            NFLOAT_EXP(res) = sexp - norm;
+        }
+        else if (s1 != 0)
+        {
+            norm = flint_clz(s1);
+            if (norm)
+            {
+                NFLOAT_D(res)[0] = (s0 << norm);
+                NFLOAT_D(res)[1] = (s1 << norm) | (s0 >> (FLINT_BITS - norm));
+            }
+            else
+            {
+                NFLOAT_D(res)[0] = s0;
+                NFLOAT_D(res)[1] = s1;
+            }
+            NFLOAT_EXP(res) = sexp - FLINT_BITS - norm;
+        }
+        else if (s0 != 0)
+        {
+            norm = flint_clz(s0);
+            NFLOAT_D(res)[0] = 0;
+            NFLOAT_D(res)[1] = s0 << norm;
+            NFLOAT_EXP(res) = sexp - 2 * FLINT_BITS - norm;
+        }
+        else
+        {
+            return nfloat_zero(res, ctx);
+        }
+
+        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+        return GR_SUCCESS;
+    }
+
     if (!(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)))
     {
         slong i, xexp, delta, sexp, norm, pad_bits;
@@ -238,6 +436,8 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
 
             if (delta < FLINT_BITS)
             {
+                FLINT_ASSERT(delta != 0);
+
                 mpn_rshift(s + 1, NFLOAT_D(initial), nlimbs, delta);
                 s[0] = NFLOAT_D(initial)[0] << (FLINT_BITS - delta);
                 if (xsgnbit)
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index b9d629e101..ffb8d61031 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -17,7 +17,7 @@
 void
 nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
 {
-    slong iter, i, len, prec;
+    slong iter, i, len, prec, ebits;
     gr_ptr s0, vec1, vec2, res;
     int subtract, initial, reverse;
     arf_ptr avec1, avec2;
@@ -29,6 +29,11 @@ nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
     {
         len = n_randint(state, 30);
 
+        if (n_randint(state, 2))
+            ebits = 1;
+        else
+            ebits = 10;
+
         initial = n_randint(state, 2);
         subtract = n_randint(state, 2);
         reverse = n_randint(state, 2);
@@ -49,7 +54,7 @@ nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
 
         if (initial)
         {
-            arf_randtest(as0, state, prec, 10);
+            arf_randtest(as0, state, prec, ebits);
             GR_MUST_SUCCEED(nfloat_set_arf(s0, as0, ctx));
 
             arf_set(ares, as0);
@@ -66,8 +71,8 @@ nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
             }
             else
             {
-                arf_randtest(avec1 + i, state, prec, 10);
-                arf_randtest(avec2 + i, state, prec, 10);
+                arf_randtest(avec1 + i, state, prec, ebits);
+                arf_randtest(avec2 + i, state, prec, ebits);
             }
         }
 

From 57b5310859c2f114e0d25d53f8a27eeedc034a16 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 6 May 2024 20:44:33 +0200
Subject: [PATCH 05/12] some profiling code

---
 src/nfloat/dot.c              |  11 ----
 src/nfloat/profile/p-vs_arf.c | 107 ++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 11 deletions(-)
 create mode 100644 src/nfloat/profile/p-vs_arf.c

diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c
index b2596378be..65b549924f 100644
--- a/src/nfloat/dot.c
+++ b/src/nfloat/dot.c
@@ -272,10 +272,7 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
         {
             if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
-            {
-                DUMMY_OP;
                 continue;
-            }
 
             xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
             xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
@@ -397,12 +394,10 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         mp_limb_t t[NFLOAT_MAX_LIMBS + 1];
         mp_limb_t s[NFLOAT_MAX_LIMBS + 1];
         nfloat_srcptr xi, yi;
-        mp_limb_t s0, s1;
         slong n;
 
         slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
 
-        s0 = s1 = 0;
         sexp = WORD_MIN;
 
         if (initial != NULL && !NFLOAT_IS_ZERO(initial))
@@ -411,10 +406,7 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
         {
             if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
-            {
-                DUMMY_OP;
                 continue;
-            }
 
             xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
             sexp = FLINT_MAX(sexp, xexp);
@@ -466,10 +458,7 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
         {
             if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
-            {
-                DUMMY_OP;
                 continue;
-            }
 
             xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
             xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
diff --git a/src/nfloat/profile/p-vs_arf.c b/src/nfloat/profile/p-vs_arf.c
new file mode 100644
index 0000000000..dc847fe8ec
--- /dev/null
+++ b/src/nfloat/profile/p-vs_arf.c
@@ -0,0 +1,107 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <stdlib.h>
+#include "fmpz.h"
+#include "gr.h"
+#include "gr_vec.h"
+#include "gr_mat.h"
+#include "arf.h"
+#include "nfloat.h"
+#include "profiler.h"
+#include "double_extras.h"
+
+#if 1
+#undef TIMEIT_END_REPEAT
+#define TIMEIT_END_REPEAT(__timer, __reps) \
+            } \
+            timeit_stop(__timer); \
+            if (__timer->cpu >= 100) \
+                break; \
+            __reps *= 10; \
+        } \
+    } while (0);
+#endif
+
+int main()
+{
+    gr_ptr vec, vec2;
+    gr_ptr x;
+    gr_ctx_t ctx;
+    int which;
+    slong i, n;
+    slong prec;
+    double __, t, arf_tsum = 0.0, arf_tprod = 0.0, arf_tdot = 0.0;
+    double nfloat_tsum = 0.0, nfloat_tprod = 0.0, nfloat_tdot = 0.0;
+
+    flint_printf("                   _gr_vec_sum          _gr_vec_product      _gr_vec_dot\n");
+
+    for (prec = 64; prec <= 2048; prec *= 2)
+    {
+        flint_printf("prec = %wd\n", prec);
+
+        for (n = 3; n <= 100; n = (n == 3) ? 10 : n * 10)
+        {
+            for (which = 0; which < 2; which++)
+            {
+                if (which == 0)
+                    gr_ctx_init_real_float_arf(ctx, prec);
+                else
+                    nfloat_ctx_init(ctx, prec, 0);
+
+                x = gr_heap_init(ctx);
+                vec = gr_heap_init_vec(n, ctx);
+                vec2 = gr_heap_init_vec(n, ctx);
+
+                for (i = 0; i < n; i++)
+                {
+                    GR_MUST_SUCCEED(gr_one(GR_ENTRY(vec, i, ctx->sizeof_elem), ctx));
+                    if (i % 3 == 0)
+                        GR_MUST_SUCCEED(gr_neg(GR_ENTRY(vec, i, ctx->sizeof_elem), GR_ENTRY(vec, i, ctx->sizeof_elem), ctx));
+                    GR_MUST_SUCCEED(gr_div_ui(GR_ENTRY(vec, i, ctx->sizeof_elem), GR_ENTRY(vec, i, ctx->sizeof_elem), 2 * i + 3, ctx));
+                }
+
+                for (i = 0; i < n; i++)
+                {
+                    GR_MUST_SUCCEED(gr_one(GR_ENTRY(vec2, i, ctx->sizeof_elem), ctx));
+                    if (i % 5 == 0)
+                        GR_MUST_SUCCEED(gr_neg(GR_ENTRY(vec2, i, ctx->sizeof_elem), GR_ENTRY(vec2, i, ctx->sizeof_elem), ctx));
+                    GR_MUST_SUCCEED(gr_div_ui(GR_ENTRY(vec2, i, ctx->sizeof_elem), GR_ENTRY(vec2, i, ctx->sizeof_elem), 2 * i + 5, ctx));
+                }
+
+                TIMEIT_START
+                GR_MUST_SUCCEED(_gr_vec_sum(x, vec, n, ctx));
+                TIMEIT_STOP_VALUES(__, t)
+                if (which == 0) arf_tsum = t; else nfloat_tsum = t;
+                (void) __;
+
+                TIMEIT_START
+                GR_MUST_SUCCEED(_gr_vec_product(x, vec, n, ctx));
+                TIMEIT_STOP_VALUES(__, t)
+                if (which == 0) arf_tprod = t; else nfloat_tprod = t;
+
+                TIMEIT_START
+                GR_MUST_SUCCEED(_gr_vec_dot(x, NULL, 0, vec, vec2, n, ctx));
+                TIMEIT_STOP_VALUES(__, t)
+                if (which == 0) arf_tdot = t; else nfloat_tdot = t;
+
+                gr_heap_clear(x, ctx);
+                gr_heap_clear_vec(vec, n, ctx);
+            }
+
+            flint_printf("n = %4wd   ", n);
+            flint_printf("     %.3e (%.3fx)   %.3e (%.3fx)   %.3e (%.3fx)\n",
+                nfloat_tsum, arf_tsum / nfloat_tsum, nfloat_tprod, arf_tprod / nfloat_tprod, nfloat_tdot, arf_tdot / nfloat_tdot);
+        }
+    }
+
+    return 0;
+}

From cb0fe26deb581b9f787b14eaf93d3bd3e2f3b60b Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Tue, 7 May 2024 13:13:19 +0200
Subject: [PATCH 06/12] more nfloat conversions

---
 doc/source/nfloat.rst |  15 +++++-
 src/gr/arf.c          |   6 +++
 src/nfloat.h          |   5 ++
 src/nfloat/ctx.c      |   2 +-
 src/nfloat/nfloat.c   | 121 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index bc372457d8..fb81ddc34e 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -7,7 +7,8 @@ This module provides binary floating-point numbers in a flat representation
 with precision in small fixed multiples of the word size
 (64, 128, 192, 256, ... bits on a 64-bit machine). The exponent range is
 close to a full word.
-A number with `n`-limb precision is stored using `n+2` limbs as follows:
+A number with `n`-limb precision is stored as `n+2` contiguous
+limbs as follows:
 
     +---------------+
     | exponent limb |
@@ -21,6 +22,13 @@ A number with `n`-limb precision is stored using `n+2` limbs as follows:
     | mantissa[n-1] |
     +---------------+
 
+For normal (nonzero and finite) values `x`,
+the most significant limb of the mantissa is always normalised
+to have its most significant bit set, and the exponent `e` is the
+unique integer such that `|x| \in [0.5, 1) \cdot 2^e`.
+Special (zero or nonfinite) values are encoded using special
+values of the exponent field, with junk data in the mantissa.
+
 This type has the advantage that floating-point numbers with the
 same precision can be packed together tightly in vectors
 and created on the stack without heap allocation.
@@ -228,6 +236,11 @@ These methods are interchangeable with their ``gr`` counterparts.
 .. function:: int nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx)
               int nfloat_get_arf(arf_t res, nfloat_srcptr x, gr_ctx_t ctx)
 
+.. function:: int nfloat_set_fmpq(nfloat_ptr res, const fmpq_t v, gr_ctx_t ctx)
+              int nfloat_set_d(nfloat_ptr res, double x, gr_ctx_t ctx)
+              int nfloat_set_str(nfloat_ptr res, const char * x, gr_ctx_t ctx)
+              int nfloat_set_other(nfloat_ptr res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx)
+
 .. function:: int nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx)
               int nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx)
 
diff --git a/src/gr/arf.c b/src/gr/arf.c
index f79ef28840..5343cf2795 100644
--- a/src/gr/arf.c
+++ b/src/gr/arf.c
@@ -17,6 +17,7 @@
 #include "gr_vec.h"
 #include "gr_poly.h"
 #include "gr_generic.h"
+#include "nfloat.h"
 
 typedef struct
 {
@@ -192,6 +193,11 @@ _gr_arf_set_other(arf_t res, gr_srcptr x, gr_ctx_t x_ctx, const gr_ctx_t ctx)
         case GR_CTX_RR_ARB:
             return _gr_arf_set(res, arb_midref((arb_srcptr) x), ctx);
 
+        case GR_CTX_NFLOAT:
+            nfloat_get_arf(res, x, x_ctx);
+            arf_set_round(res, res, ARF_CTX_PREC(ctx), ARF_CTX_RND(ctx));
+            return GR_SUCCESS;
+
         default:
             {
                 gr_ctx_t cctx;
diff --git a/src/nfloat.h b/src/nfloat.h
index eff8a3b894..1e441edaf3 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -272,6 +272,11 @@ int nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx);
 int nfloat_get_arf(arf_t res, nfloat_srcptr x, gr_ctx_t ctx);
 #endif
 
+int nfloat_set_fmpq(nfloat_ptr res, const fmpq_t v, gr_ctx_t ctx);
+int nfloat_set_d(nfloat_ptr res, double x, gr_ctx_t ctx);
+int nfloat_set_str(nfloat_ptr res, const char * x, gr_ctx_t ctx);
+int nfloat_set_other(nfloat_ptr res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx);
+
 int nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx);
 int nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx);
 
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
index d37319a183..719555a0ac 100644
--- a/src/nfloat/ctx.c
+++ b/src/nfloat/ctx.c
@@ -56,11 +56,11 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_SET_SI,          (gr_funcptr) nfloat_set_si},
     {GR_METHOD_SET_UI,          (gr_funcptr) nfloat_set_ui},
     {GR_METHOD_SET_FMPZ,        (gr_funcptr) nfloat_set_fmpz},
-/*
     {GR_METHOD_SET_FMPQ,        (gr_funcptr) nfloat_set_fmpq},
     {GR_METHOD_SET_D,           (gr_funcptr) nfloat_set_d},
     {GR_METHOD_SET_STR,         (gr_funcptr) nfloat_set_str},
     {GR_METHOD_SET_OTHER,       (gr_funcptr) nfloat_set_other},
+/*
     {GR_METHOD_GET_FMPZ,        (gr_funcptr) nfloat_get_fmpz},
     {GR_METHOD_GET_FMPQ,        (gr_funcptr) nfloat_get_fmpq},
     {GR_METHOD_GET_UI,          (gr_funcptr) nfloat_get_ui},
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index 2c2b3ef6e7..78215e507a 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -13,7 +13,9 @@
 #include <mpfr.h>
 #include "fmpz.h"
 #include "arf.h"
+#include "arb.h"
 #include "nfloat.h"
+#include "gr_generic.h"
 
 int
 nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx)
@@ -244,6 +246,125 @@ nfloat_set_fmpz(nfloat_ptr res, const fmpz_t x, gr_ctx_t ctx)
     }
 }
 
+/* todo: fast code */
+int
+nfloat_set_fmpq(nfloat_ptr res, const fmpq_t v, gr_ctx_t ctx)
+{
+    arf_t t;
+    int status;
+    arf_init(t);
+    arf_set_fmpq(t, v, NFLOAT_CTX_PREC(ctx), ARF_RND_DOWN);
+    status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+    return status;
+}
+
+/* todo: fast code */
+int
+nfloat_set_d(nfloat_ptr res, double x, gr_ctx_t ctx)
+{
+    arf_t t;
+    int status;
+    arf_init(t);
+    arf_set_d(t, x);
+    status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+    return status;
+}
+
+int
+nfloat_set_str(nfloat_ptr res, const char * x, gr_ctx_t ctx)
+{
+    int status;
+
+    arb_t t;
+    arb_init(t);
+
+    if (!arb_set_str(t, x, NFLOAT_CTX_PREC(ctx) + 20))
+    {
+        arf_set_round(arb_midref(t), arb_midref(t), NFLOAT_CTX_PREC(ctx), ARF_RND_NEAR);
+        status = nfloat_set_arf(res, arb_midref(t), ctx);
+    }
+    else
+    {
+        status = gr_generic_set_str_ring_exponents(res, x, ctx);
+    }
+
+    arb_clear(t);
+    return status;
+}
+
+int
+nfloat_set_other(nfloat_ptr res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx)
+{
+    switch (x_ctx->which_ring)
+    {
+        case GR_CTX_NFLOAT:
+            {
+                slong nlimbs, x_nlimbs;
+
+                if (NFLOAT_IS_SPECIAL(x))
+                {
+                    if (NFLOAT_IS_ZERO(x))
+                        return nfloat_zero(res, ctx);
+                    if (NFLOAT_IS_POS_INF(x))
+                        return nfloat_pos_inf(res, ctx);
+                    if (NFLOAT_IS_NEG_INF(x))
+                        return nfloat_neg_inf(res, ctx);
+                    return nfloat_nan(res, ctx);
+                }
+
+                nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+                x_nlimbs = NFLOAT_CTX_NLIMBS(x_ctx);
+
+                NFLOAT_EXP(res) = NFLOAT_EXP(x);
+                NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x);
+
+                if (nlimbs <= x_nlimbs)
+                {
+                    flint_mpn_copyi(NFLOAT_D(res), NFLOAT_D(x) + x_nlimbs - nlimbs, nlimbs);
+                }
+                else
+                {
+                    flint_mpn_zero(NFLOAT_D(res), nlimbs - x_nlimbs);
+                    flint_mpn_copyi(NFLOAT_D(res) + nlimbs - x_nlimbs, NFLOAT_D(x), x_nlimbs);
+                }
+
+                return GR_SUCCESS;
+            }
+
+        case GR_CTX_FMPZ:
+            return nfloat_set_fmpz(res, x, ctx);
+
+        case GR_CTX_FMPQ:
+            return nfloat_set_fmpq(res, x, ctx);
+
+        case GR_CTX_REAL_FLOAT_ARF:
+            return nfloat_set_arf(res, x, ctx);
+
+        case GR_CTX_RR_ARB:
+            return nfloat_set_arf(res, arb_midref((arb_srcptr) x), ctx);
+
+        default:
+            {
+                int status;
+                arf_t t;
+
+                gr_ctx_t arf_ctx;
+                arf_init(t);
+
+                gr_ctx_init_real_float_arf(arf_ctx, NFLOAT_CTX_PREC(ctx));
+                status = gr_set_other(t, x, x_ctx, arf_ctx);
+                if (status == GR_SUCCESS)
+                    status = nfloat_set_arf(res, t, ctx);
+
+                arf_clear(t);
+                gr_ctx_clear(arf_ctx);
+                return status;
+            }
+    }
+}
+
 int
 nfloat_set_arf(nfloat_ptr res, const arf_t x, gr_ctx_t ctx)
 {

From 533d673f5eb3e1774eec90e5fad310e9fe5a9627 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Tue, 7 May 2024 14:38:59 +0200
Subject: [PATCH 07/12] more function wrappers

---
 doc/source/nfloat.rst      |  25 +++++
 src/nfloat.h               |  24 +++++
 src/nfloat/ctx.c           |   7 +-
 src/nfloat/nfloat.c        | 197 +++++++++++++++++++++++++++++++++++++
 src/python/flint_ctypes.py |  32 ++++++
 5 files changed, 280 insertions(+), 5 deletions(-)

diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index fb81ddc34e..6c202561ee 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -257,9 +257,34 @@ These methods are interchangeable with their ``gr`` counterparts.
               int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
               int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
 
+.. function:: int nfloat_sqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_rsqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+
 .. function:: int nfloat_sgn(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
               int nfloat_im(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
 
+.. function:: int nfloat_floor(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_ceil(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_trunc(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_nint(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+
+.. function:: int nfloat_pow(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+
+.. function:: int nfloat_pi(nfloat_ptr res, gr_ctx_t ctx)
+              int nfloat_exp(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_expm1(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_log(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_log1p(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_sin(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_cos(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_tan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_sinh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_cosh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_tanh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_atan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_gamma(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_zeta(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+
 Vector functions
 -------------------------------------------------------------------------------
 
diff --git a/src/nfloat.h b/src/nfloat.h
index 1e441edaf3..67d1b6afe3 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -302,6 +302,30 @@ int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx);
 int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx);
 
+int nfloat_sqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_rsqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+
+int nfloat_floor(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_ceil(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_trunc(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_nint(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+
+int nfloat_pi(nfloat_ptr res, gr_ctx_t ctx);
+int nfloat_pow(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
+int nfloat_exp(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_expm1(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_log(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_log1p(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_sin(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_cos(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_tan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_sinh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_cosh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_tanh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_atan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_gamma(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+int nfloat_zeta(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
+
 int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
index 719555a0ac..d808a0f244 100644
--- a/src/nfloat/ctx.c
+++ b/src/nfloat/ctx.c
@@ -102,9 +102,9 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_MUL_2EXP_FMPZ,      (gr_funcptr) nfloat_mul_2exp_fmpz},
     {GR_METHOD_SET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_set_fmpz_2exp_fmpz},
     {GR_METHOD_GET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_get_fmpz_2exp_fmpz},
+*/
 
     {GR_METHOD_POW,             (gr_funcptr) nfloat_pow},
-*/
 
 /*
     {GR_METHOD_POW_UI,          (gr_funcptr) nfloat_pow_ui},
@@ -113,7 +113,6 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_POW_FMPQ,        (gr_funcptr) nfloat_pow_fmpq},
 */
 
-/*
     {GR_METHOD_SQRT,            (gr_funcptr) nfloat_sqrt},
     {GR_METHOD_RSQRT,           (gr_funcptr) nfloat_rsqrt},
 
@@ -127,7 +126,6 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_CEIL,            (gr_funcptr) nfloat_ceil},
     {GR_METHOD_TRUNC,           (gr_funcptr) nfloat_trunc},
     {GR_METHOD_NINT,            (gr_funcptr) nfloat_nint},
-*/
 
     {GR_METHOD_ABS,             (gr_funcptr) nfloat_abs},
     {GR_METHOD_CONJ,            (gr_funcptr) nfloat_set},
@@ -139,7 +137,7 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_CMPABS,          (gr_funcptr) nfloat_cmpabs},
 
     {GR_METHOD_I,               (gr_funcptr) gr_not_in_domain},
-/*
+
     {GR_METHOD_PI,              (gr_funcptr) nfloat_pi},
     {GR_METHOD_EXP,             (gr_funcptr) nfloat_exp},
     {GR_METHOD_EXPM1,           (gr_funcptr) nfloat_expm1},
@@ -154,7 +152,6 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_ATAN,            (gr_funcptr) nfloat_atan},
     {GR_METHOD_GAMMA,            (gr_funcptr) nfloat_gamma},
     {GR_METHOD_ZETA,             (gr_funcptr) nfloat_zeta},
-*/
 
 /*
     {GR_METHOD_VEC_ADD,                 (gr_funcptr) _nfloat_vec_add},
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index 78215e507a..f1bbaed3b7 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -16,6 +16,7 @@
 #include "arb.h"
 #include "nfloat.h"
 #include "gr_generic.h"
+#include "gr_special.h"
 
 int
 nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx)
@@ -183,6 +184,9 @@ _nfloat_underflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
 int
 _nfloat_overflow(nfloat_ptr res, int sgnbit, gr_ctx_t ctx)
 {
+    if (!(NFLOAT_CTX_FLAGS(ctx) & NFLOAT_ALLOW_INF))
+        return GR_UNABLE;
+
     return sgnbit ? nfloat_neg_inf(res, ctx) : nfloat_pos_inf(res, ctx);
 }
 
@@ -1171,3 +1175,196 @@ nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
 }
 
 #endif
+
+int
+nfloat_sqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    mpfr_t xf, zf;
+    int odd_exp;
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        if (NFLOAT_IS_NEG_INF(x))
+            return nfloat_nan(res, ctx);
+        else
+            return nfloat_set(res, x, ctx);
+    }
+
+    if (NFLOAT_SGNBIT(x))
+        return nfloat_nan(res, ctx);
+
+    odd_exp = NFLOAT_EXP(x) & 1;
+
+    /* Powers of two */
+    if (odd_exp &&
+        NFLOAT_D(x)[nlimbs - 1] == (UWORD(1) << (FLINT_BITS - 1)) &&
+        flint_mpn_zero_p(NFLOAT_D(x), nlimbs - 1))
+    {
+        nfloat_set(res, x, ctx);
+        NFLOAT_EXP(res) = (NFLOAT_EXP(res) + 1) / 2;
+        return GR_SUCCESS;
+    }
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = odd_exp;
+
+    if (res == x)
+    {
+        mpfr_sqrt(xf, xf, MPFR_RNDZ);
+    }
+    else
+    {
+        zf->_mpfr_d = NFLOAT_D(res);
+        zf->_mpfr_prec = prec;
+        zf->_mpfr_sign = 1;
+        zf->_mpfr_exp = 0;
+
+        mpfr_sqrt(zf, xf, MPFR_RNDZ);
+    }
+
+    /* floor division */
+    NFLOAT_EXP(res) = (NFLOAT_EXP(x) - (NFLOAT_EXP(x) < 0)) / 2 + zf->_mpfr_exp;
+
+    NFLOAT_SGNBIT(res) = 0;
+
+    return GR_SUCCESS;
+}
+
+int
+nfloat_rsqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    mpfr_t xf, zf;
+    int odd_exp;
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            return nfloat_pos_inf(res, ctx);
+        else if (NFLOAT_IS_POS_INF(x))
+            return nfloat_zero(res, ctx);
+        else
+            return nfloat_nan(res, ctx);
+    }
+
+    if (NFLOAT_SGNBIT(x))
+        return nfloat_nan(res, ctx);
+
+    odd_exp = NFLOAT_EXP(x) & 1;
+
+    /* Powers of two */
+    if (odd_exp &&
+        NFLOAT_D(x)[nlimbs - 1] == (UWORD(1) << (FLINT_BITS - 1)) &&
+        flint_mpn_zero_p(NFLOAT_D(x), nlimbs - 1))
+    {
+        nfloat_set(res, x, ctx);
+        NFLOAT_EXP(res) = ((-NFLOAT_EXP(res) + 1)) / 2 + 1;
+        return GR_SUCCESS;
+    }
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = odd_exp;
+
+    if (res == x)
+    {
+        mpfr_rec_sqrt(xf, xf, MPFR_RNDZ);
+    }
+    else
+    {
+        zf->_mpfr_d = NFLOAT_D(res);
+        zf->_mpfr_prec = prec;
+        zf->_mpfr_sign = 1;
+        zf->_mpfr_exp = 0;
+
+        mpfr_rec_sqrt(zf, xf, MPFR_RNDZ);
+    }
+
+    /* floor division */
+    NFLOAT_EXP(res) = -((NFLOAT_EXP(x) - (NFLOAT_EXP(x) < 0)) / 2) + zf->_mpfr_exp;
+
+    NFLOAT_SGNBIT(res) = 0;
+
+    return GR_SUCCESS;
+}
+
+static int
+_nfloat_func1_via_arf(gr_method_unary_op func, nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    gr_ctx_t arf_ctx;
+    arf_t t;
+    int status;
+
+    gr_ctx_init_real_float_arf(arf_ctx, NFLOAT_CTX_PREC(ctx));
+    arf_init(t);
+    nfloat_get_arf(t, x, ctx);
+    status = func(t, t, arf_ctx);
+    if (status == GR_SUCCESS)
+        status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+    gr_ctx_clear(arf_ctx);
+
+    return status;
+}
+
+static int
+_nfloat_func2_via_arf(gr_method_binary_op func, nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+{
+    gr_ctx_t arf_ctx;
+    arf_t t, u;
+    int status;
+
+    gr_ctx_init_real_float_arf(arf_ctx, NFLOAT_CTX_PREC(ctx));
+    arf_init(t);
+    arf_init(u);
+    nfloat_get_arf(t, x, ctx);
+    nfloat_get_arf(u, y, ctx);
+    status = func(t, t, u, arf_ctx);
+    if (status == GR_SUCCESS)
+        status = nfloat_set_arf(res, t, ctx);
+    arf_clear(t);
+    arf_clear(u);
+    gr_ctx_clear(arf_ctx);
+
+    return status;
+}
+
+/* todo: fast code */
+int nfloat_floor(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_floor, res, x, ctx); }
+int nfloat_ceil(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_ceil, res, x, ctx); }
+int nfloat_trunc(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_trunc, res, x, ctx); }
+int nfloat_nint(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_nint, res, x, ctx); }
+
+int nfloat_pi(nfloat_ptr res, gr_ctx_t ctx)
+{
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+
+    FLINT_ASSERT(nlimbs <= ARB_PI4_TAB_LIMBS);
+
+    NFLOAT_EXP(res) = 0;
+    NFLOAT_SGNBIT(res) = 0;
+    flint_mpn_copyi(NFLOAT_D(res), arb_pi4_tab + ARB_PI4_TAB_LIMBS - nlimbs, nlimbs);
+
+    return GR_SUCCESS;
+}
+
+int nfloat_pow(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) { return _nfloat_func2_via_arf((gr_method_binary_op) gr_pow, res, x, y, ctx); }
+int nfloat_exp(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_exp, res, x, ctx); }
+int nfloat_expm1(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_expm1, res, x, ctx); }
+int nfloat_log(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_log, res, x, ctx); }
+int nfloat_log1p(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_log1p, res, x, ctx); }
+int nfloat_sin(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_sin, res, x, ctx); }
+int nfloat_cos(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_cos, res, x, ctx); }
+int nfloat_tan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_tan, res, x, ctx); }
+int nfloat_sinh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_sinh, res, x, ctx); }
+int nfloat_cosh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_cosh, res, x, ctx); }
+int nfloat_tanh(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_tanh, res, x, ctx); }
+int nfloat_atan(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_atan, res, x, ctx); }
+int nfloat_gamma(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_gamma, res, x, ctx); }
+int nfloat_zeta(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx) { return _nfloat_func1_via_arf((gr_method_unary_op) gr_zeta, res, x, ctx); }
diff --git a/src/python/flint_ctypes.py b/src/python/flint_ctypes.py
index 8d985f48c6..f68df40d51 100644
--- a/src/python/flint_ctypes.py
+++ b/src/python/flint_ctypes.py
@@ -1,6 +1,7 @@
 import ctypes
 import ctypes.util
 import sys
+import functools
 
 libflint_path = ctypes.util.find_library('flint')
 if libflint_path == None:
@@ -4690,6 +4691,37 @@ def _default_context():
         return CF
 
 
+@functools.cache
+def get_nfloat_class(prec):
+    n = (prec + FLINT_BITS - 1) // FLINT_BITS
+    prec = n * FLINT_BITS
+
+    class _nfloat_struct(ctypes.Structure):
+        _fields_ = [('val', c_ulong * (n + 2))]
+
+    _nfloat_struct.__qualname__ = _nfloat_struct.__name__ = ("nfloat" + str(prec) + "_struct")
+
+    class _nfloat_class(gr_elem):
+        _struct_type = _nfloat_struct
+
+        @staticmethod
+        def _default_context():
+            raise NotImplementedError
+
+    _nfloat_class.__qualname__ = _nfloat_class.__name__ = ("nfloat" + str(prec))
+
+    return _nfloat_class
+
+
+class RealFloat_nfloat(gr_ctx):
+    def __init__(self, prec=128):
+        gr_ctx.__init__(self)
+        libflint.nfloat_ctx_init(self._ref, prec, 0)
+        self._elem_type = get_nfloat_class(prec)
+
+
+
+
 class IntegersMod_nmod(gr_ctx):
     def __init__(self, n, n_is_prime=None):
         n = self._as_ui(n)

From 5c02b1f8b36b1451282aa553b35d298d98ef9143 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Tue, 7 May 2024 15:11:38 +0200
Subject: [PATCH 08/12] basic nfloat_inv; include module in CMake

---
 CMakeLists.txt        |  1 +
 doc/source/nfloat.rst |  3 ++-
 src/nfloat.h          |  1 +
 src/nfloat/ctx.c      |  4 +++-
 src/nfloat/nfloat.c   | 38 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cef9b34fa..b958959259 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,6 +211,7 @@ set(_BUILD_DIRS
     long_extras
     perm
     double_extras   d_vec           d_mat
+    nfloat
     mpn_extras
     mpfr_vec                        mpfr_mat
     nmod            nmod_vec        nmod_mat        nmod_poly
diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index 6c202561ee..ef1e45aeac 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -253,7 +253,8 @@ These methods are interchangeable with their ``gr`` counterparts.
               int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
               int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
 
-.. function:: int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
+.. function:: int nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+              int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
               int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
               int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
 
diff --git a/src/nfloat.h b/src/nfloat.h
index 67d1b6afe3..eb7c834329 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -298,6 +298,7 @@ int nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 
+int nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
 int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx);
 int nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx);
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
index d808a0f244..69ec78cd36 100644
--- a/src/nfloat/ctx.c
+++ b/src/nfloat/ctx.c
@@ -96,8 +96,10 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_DIV_SI,          (gr_funcptr) nfloat_div_si},
 /*
     {GR_METHOD_DIV_FMPZ,        (gr_funcptr) nfloat_div_fmpz},
+*/
     {GR_METHOD_INV,             (gr_funcptr) nfloat_inv},
 
+/*
     {GR_METHOD_MUL_2EXP_SI,        (gr_funcptr) nfloat_mul_2exp_si},
     {GR_METHOD_MUL_2EXP_FMPZ,      (gr_funcptr) nfloat_mul_2exp_fmpz},
     {GR_METHOD_SET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_set_fmpz_2exp_fmpz},
@@ -163,9 +165,9 @@ gr_method_tab_input _nfloat_methods_input[] =
     {GR_METHOD_POLY_MULLOW,     (gr_funcptr) nfloat_poly_mullow},
     {GR_METHOD_POLY_ROOTS_OTHER,(gr_funcptr) nfloat_poly_roots_other},
     {GR_METHOD_MAT_MUL,         (gr_funcptr) nfloat_mat_mul},
+*/
     {GR_METHOD_MAT_DET,         (gr_funcptr) gr_mat_det_generic_field},
     {GR_METHOD_MAT_FIND_NONZERO_PIVOT,     (gr_funcptr) gr_mat_find_nonzero_pivot_large_abs},
-*/
 
     {0,                         (gr_funcptr) NULL},
 };
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index f1bbaed3b7..79d9e12792 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -1033,6 +1033,41 @@ nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
+int
+nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
+{
+    mpfr_t rf, xf;
+    slong prec = NFLOAT_CTX_PREC(ctx);
+
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        if (NFLOAT_IS_ZERO(x))
+            return nfloat_nan(res, ctx);
+        else
+            return nfloat_nan(res, ctx); /* todo */
+    }
+
+    /* todo: make sure aliasing is correct */
+
+    rf->_mpfr_d = NFLOAT_D(res);
+    rf->_mpfr_prec = prec;
+    rf->_mpfr_sign = 1;
+    rf->_mpfr_exp = 0;
+
+    xf->_mpfr_d = NFLOAT_D(x);
+    xf->_mpfr_prec = prec;
+    xf->_mpfr_sign = 1;
+    xf->_mpfr_exp = 0;
+
+    mpfr_ui_div(rf, 1, xf, MPFR_RNDZ);
+
+    NFLOAT_EXP(res) = -NFLOAT_EXP(x) + rf->_mpfr_exp;
+    NFLOAT_SGNBIT(res) = NFLOAT_SGNBIT(x);
+
+    NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+    return GR_SUCCESS;
+}
+
 int
 nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
 {
@@ -1047,6 +1082,7 @@ nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
             return nfloat_nan(res, ctx); /* todo */
     }
 
+    /* todo: make sure aliasing is correct */
     rf->_mpfr_d = NFLOAT_D(res);
     rf->_mpfr_prec = prec;
     rf->_mpfr_sign = 1;
@@ -1088,6 +1124,7 @@ nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
             return nfloat_nan(res, ctx); /* todo */
     }
 
+    /* todo: make sure aliasing is correct */
     rf->_mpfr_d = NFLOAT_D(res);
     rf->_mpfr_prec = prec;
     rf->_mpfr_sign = 1;
@@ -1121,6 +1158,7 @@ nfloat_div_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
             return nfloat_nan(res, ctx); /* todo */
     }
 
+    /* todo: make sure aliasing is correct */
     rf->_mpfr_d = NFLOAT_D(res);
     rf->_mpfr_prec = prec;
     rf->_mpfr_sign = 1;

From 65eacd4773e784adb376979d5c0a18940b9f8b2a Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Wed, 8 May 2024 23:33:29 +0200
Subject: [PATCH 09/12] acb_zeta: band-aid for huge positive integers

---
 src/acb_dirichlet/zeta.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/acb_dirichlet/zeta.c b/src/acb_dirichlet/zeta.c
index 84950a6241..4d1833ceea 100644
--- a/src/acb_dirichlet/zeta.c
+++ b/src/acb_dirichlet/zeta.c
@@ -23,11 +23,20 @@ void acb_zeta_si(acb_t z, slong s, slong prec);
 static void
 _acb_dirichlet_zeta(acb_t res, const acb_t s, slong prec)
 {
-    acb_t a;
-    acb_init(a);
-    acb_one(a);
-    _acb_poly_zeta_cpx_series(res, s, a, 0, 1, prec);
-    acb_clear(a);
+    /* todo: better asymptotic code, and also for hurwitz zeta */
+    if (acb_is_int(s) && arf_cmp_si(arb_midref(acb_realref(s)), prec) > 0)
+    {
+        acb_one(res);
+        mag_set_ui_2exp_si(arb_radref(acb_realref(res)), 1, -prec);
+    }
+    else
+    {
+        acb_t a;
+        acb_init(a);
+        acb_one(a);
+        _acb_poly_zeta_cpx_series(res, s, a, 0, 1, prec);
+        acb_clear(a);
+    }
 }
 
 /* todo: use euler product for complex s, and check efficiency

From fe673e2c54b3a1e4eb4ca6f85c6ce8616f9288bd Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Wed, 8 May 2024 23:38:02 +0200
Subject: [PATCH 10/12] more nfloat fixes & test code

---
 doc/source/nfloat.rst      |   2 +
 src/gr/arb.c               |  22 ++++
 src/nfloat.h               |   2 +
 src/nfloat/ctx.c           |   2 +-
 src/nfloat/nfloat.c        |  66 +++++++++--
 src/nfloat/test/t-nfloat.c | 234 +++++++++++++++++++++++++++++++++++++
 6 files changed, 315 insertions(+), 13 deletions(-)

diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index ef1e45aeac..28553e3e87 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -253,6 +253,8 @@ These methods are interchangeable with their ``gr`` counterparts.
               int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
               int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
 
+.. function:: int nfloat_mul_2exp_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
+
 .. function:: int nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
               int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
               int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx)
diff --git a/src/gr/arb.c b/src/gr/arb.c
index 340fc2c8fd..abb70446d8 100644
--- a/src/gr/arb.c
+++ b/src/gr/arb.c
@@ -22,6 +22,7 @@
 #include "gr_generic.h"
 #include "gr_vec.h"
 #include "gr_poly.h"
+#include "nfloat.h"
 
 typedef struct
 {
@@ -244,6 +245,27 @@ _gr_arb_set_other(arb_t res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx)
                 return GR_DOMAIN;
             }
 
+        case GR_CTX_NFLOAT:
+            if (NFLOAT_IS_SPECIAL(x))
+            {
+                if (NFLOAT_IS_ZERO(x))
+                {
+                    arb_zero(res);
+                    return GR_SUCCESS;
+                }
+                else
+                {
+                    return GR_UNABLE;
+                }
+            }
+            else
+            {
+                nfloat_get_arf(arb_midref(res), x, x_ctx);
+                mag_zero(arb_radref(res));
+                arb_set_round(res, res, ARB_CTX_PREC(ctx));
+                return GR_SUCCESS;
+            }
+
         case GR_CTX_RR_ARB:
             arb_set_round(res, x, ARB_CTX_PREC(ctx));
             return GR_SUCCESS;
diff --git a/src/nfloat.h b/src/nfloat.h
index eb7c834329..00b9b2d5e8 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -298,6 +298,8 @@ int nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 
+int nfloat_mul_2exp_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx);
+
 int nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx);
 int nfloat_div(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx);
 int nfloat_div_ui(nfloat_ptr res, nfloat_srcptr x, ulong y, gr_ctx_t ctx);
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
index 69ec78cd36..d23493342e 100644
--- a/src/nfloat/ctx.c
+++ b/src/nfloat/ctx.c
@@ -99,8 +99,8 @@ gr_method_tab_input _nfloat_methods_input[] =
 */
     {GR_METHOD_INV,             (gr_funcptr) nfloat_inv},
 
-/*
     {GR_METHOD_MUL_2EXP_SI,        (gr_funcptr) nfloat_mul_2exp_si},
+/*
     {GR_METHOD_MUL_2EXP_FMPZ,      (gr_funcptr) nfloat_mul_2exp_fmpz},
     {GR_METHOD_SET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_set_fmpz_2exp_fmpz},
     {GR_METHOD_GET_FMPZ_2EXP_FMPZ, (gr_funcptr) nfloat_get_fmpz_2exp_fmpz},
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index 79d9e12792..54b7c8a3ff 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -11,6 +11,7 @@
 
 #include <limits.h>
 #include <mpfr.h>
+#include "long_extras.h"
 #include "fmpz.h"
 #include "arf.h"
 #include "arb.h"
@@ -1033,11 +1034,33 @@ nfloat_mul(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
+int
+nfloat_mul_2exp_si(nfloat_ptr res, nfloat_srcptr x, slong y, gr_ctx_t ctx)
+{
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        return nfloat_set(res, x, ctx);
+    }
+    else
+    {
+        /* todo */
+        if (y < NFLOAT_MIN_EXP || y > NFLOAT_MAX_EXP)
+            return GR_UNABLE;
+
+        nfloat_set(res, x, ctx);
+        NFLOAT_EXP(res) += y;
+        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+        return GR_SUCCESS;
+    }
+}
+
+
 int
 nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
 {
     mpfr_t rf, xf;
     slong prec = NFLOAT_CTX_PREC(ctx);
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
 
     if (NFLOAT_IS_SPECIAL(x))
     {
@@ -1047,6 +1070,15 @@ nfloat_inv(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
             return nfloat_nan(res, ctx); /* todo */
     }
 
+    if (NFLOAT_D(x)[nlimbs - 1] == (UWORD(1) << (FLINT_BITS - 1)) &&
+        flint_mpn_zero_p(NFLOAT_D(x), nlimbs - 1))
+    {
+        nfloat_set(res, x, ctx);
+        NFLOAT_EXP(res) = 2 - NFLOAT_EXP(res);
+        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
+        return GR_SUCCESS;
+    }
+
     /* todo: make sure aliasing is correct */
 
     rf->_mpfr_d = NFLOAT_D(res);
@@ -1245,17 +1277,22 @@ nfloat_sqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
         return GR_SUCCESS;
     }
 
-    xf->_mpfr_d = NFLOAT_D(x);
-    xf->_mpfr_prec = prec;
-    xf->_mpfr_sign = 1;
-    xf->_mpfr_exp = odd_exp;
-
     if (res == x)
     {
-        mpfr_sqrt(xf, xf, MPFR_RNDZ);
+        zf->_mpfr_d = NFLOAT_D(x);
+        zf->_mpfr_prec = prec;
+        zf->_mpfr_sign = 1;
+        zf->_mpfr_exp = odd_exp;
+
+        mpfr_sqrt(zf, zf, MPFR_RNDZ);
     }
     else
     {
+        xf->_mpfr_d = NFLOAT_D(x);
+        xf->_mpfr_prec = prec;
+        xf->_mpfr_sign = 1;
+        xf->_mpfr_exp = odd_exp;
+
         zf->_mpfr_d = NFLOAT_D(res);
         zf->_mpfr_prec = prec;
         zf->_mpfr_sign = 1;
@@ -1305,17 +1342,22 @@ nfloat_rsqrt(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx)
         return GR_SUCCESS;
     }
 
-    xf->_mpfr_d = NFLOAT_D(x);
-    xf->_mpfr_prec = prec;
-    xf->_mpfr_sign = 1;
-    xf->_mpfr_exp = odd_exp;
-
     if (res == x)
     {
-        mpfr_rec_sqrt(xf, xf, MPFR_RNDZ);
+        zf->_mpfr_d = NFLOAT_D(x);
+        zf->_mpfr_prec = prec;
+        zf->_mpfr_sign = 1;
+        zf->_mpfr_exp = odd_exp;
+
+        mpfr_rec_sqrt(zf, zf, MPFR_RNDZ);
     }
     else
     {
+        xf->_mpfr_d = NFLOAT_D(x);
+        xf->_mpfr_prec = prec;
+        xf->_mpfr_sign = 1;
+        xf->_mpfr_exp = odd_exp;
+
         zf->_mpfr_d = NFLOAT_D(res);
         zf->_mpfr_prec = prec;
         zf->_mpfr_sign = 1;
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index ffb8d61031..e7ccf8f040 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -11,9 +11,179 @@
 
 #include "test_helpers.h"
 #include "gr_vec.h"
+#include "gr_special.h"
 #include "arf.h"
 #include "nfloat.h"
 
+int
+gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int alias;
+    int cmp;
+    gr_ptr a, b, a_ref, b_ref, rel_err;
+
+    GR_TMP_INIT2(a, b, R);
+    GR_TMP_INIT3(a_ref, b_ref, rel_err, R_ref);
+
+    alias = n_randint(state, 2);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+
+    if (status == GR_SUCCESS)
+    {
+        if (alias == 0)
+        {
+            status |= op(b, a, R);
+            status |= op(b_ref, a_ref, R_ref);
+        }
+        else
+        {
+            status |= gr_set(b, a, R);
+            status |= op(b, b, R);
+            status |= op(b_ref, a_ref, R_ref);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            status |= gr_set_other(rel_err, b, R, R_ref);
+
+            status |= gr_sub(rel_err, b_ref, rel_err, R_ref);
+            status |= gr_div(rel_err, rel_err, b_ref, R_ref);
+            status |= gr_abs(rel_err, rel_err, R_ref);
+            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
+
+            if (status == GR_SUCCESS && cmp > 0)
+                status = GR_TEST_FAIL;
+
+            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+            {
+                flint_printf("\n");
+                gr_ctx_println(R);
+                gr_ctx_println(R_ref);
+                flint_printf("alias: %d\n", alias);
+                flint_printf("Computed:\n");
+                flint_printf("a = "); gr_println(a, R);
+                flint_printf("op(a) = "); gr_println(b, R);
+                flint_printf("Reference:\n");
+                flint_printf("a = "); gr_println(a_ref, R_ref);
+                flint_printf("op(a) = "); gr_println(b_ref, R_ref);
+                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
+                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
+                flint_printf("\n");
+            }
+        }
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR2(a, b, R);
+    GR_TMP_CLEAR3(a_ref, b_ref, rel_err, R_ref);
+
+    return status;
+}
+
+int
+gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int alias;
+    int cmp;
+    gr_ptr a, b, c, a_ref, b_ref, c_ref, rel_err;
+
+    GR_TMP_INIT3(a, b, c, R);
+    GR_TMP_INIT4(a_ref, b_ref, c_ref, rel_err, R_ref);
+
+    alias = n_randint(state, 5);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+    status |= gr_randtest(c, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+    status |= gr_set_other(b_ref, b, R, R_ref);
+
+    if (status == GR_SUCCESS)
+    {
+        if (alias == 0)
+        {
+            status |= op(c, a, b, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 1)
+        {
+            status |= gr_set(c, a, R);
+            status |= op(c, c, b, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 2)
+        {
+            status |= gr_set(c, b, R);
+            status |= op(c, a, c, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 3)
+        {
+            status |= gr_set(b, a, R);
+            status |= gr_set(b_ref, a_ref, R_ref);
+            status |= op(c, a, a, R);
+            status |= op(c_ref, a_ref, a_ref, R_ref);
+        }
+        else
+        {
+            status |= gr_set(b, a, R);
+            status |= gr_set(c, a, R);
+            status |= gr_set(b_ref, a_ref, R_ref);
+            status |= op(c, c, c, R);
+            status |= op(c_ref, a_ref, a_ref, R_ref);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            status |= gr_set_other(rel_err, c, R, R_ref);
+
+            status |= gr_sub(rel_err, c_ref, rel_err, R_ref);
+            status |= gr_div(rel_err, rel_err, c_ref, R_ref);
+            status |= gr_abs(rel_err, rel_err, R_ref);
+            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
+
+            if (status == GR_SUCCESS && cmp > 0)
+                status = GR_TEST_FAIL;
+
+            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+            {
+                flint_printf("\n");
+                gr_ctx_println(R);
+                gr_ctx_println(R_ref);
+                flint_printf("alias: %d\n", alias);
+                flint_printf("Computed:\n");
+                flint_printf("a = "); gr_println(a, R);
+                flint_printf("b = "); gr_println(b, R);
+                flint_printf("a (op) b = "); gr_println(c, R);
+                flint_printf("Reference:\n");
+                flint_printf("a = "); gr_println(a_ref, R_ref);
+                flint_printf("b = "); gr_println(b_ref, R_ref);
+                flint_printf("a (op) b = "); gr_println(c_ref, R_ref);
+                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
+                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
+                flint_printf("\n");
+            }
+        }
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR3(a, b, c, R);
+    GR_TMP_CLEAR4(a_ref, b_ref, c_ref, rel_err, R_ref);
+
+    return status;
+}
+
 void
 nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
 {
@@ -157,13 +327,77 @@ nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
 TEST_FUNCTION_START(nfloat, state)
 {
     gr_ctx_t ctx;
+    gr_ctx_t ctx2;
     slong prec;
 
     for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS)
     {
         nfloat_ctx_init(ctx, prec, 0);
+
         gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0);
+
         nfloat_test_dot(state, (prec <= 128 ? 10000 : 100) * flint_test_multiplier(), ctx);
+
+        {
+            gr_ptr tol;
+            slong i, reps;
+
+            gr_ctx_init_real_arb(ctx2, prec + 32);
+
+            tol = gr_heap_init(ctx2);
+
+            GR_IGNORE(gr_one(tol, ctx2));
+            GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 1, ctx2));
+
+            reps = (prec <= 128 ? 1000 : 1) * flint_test_multiplier();
+
+            for (i = 0; i < reps; i++)
+            {
+                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0);
+                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0);
+                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_pow, ctx2, tol, state, 0);
+
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_neg, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_abs, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sgn, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_inv, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sqrt, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_rsqrt, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_floor, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_ceil, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_nint, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_trunc, ctx2, tol, state, 0);
+            }
+
+            reps = (prec <= 256 ? 10 : 0) * flint_test_multiplier();
+
+            for (i = 0; i < reps; i++)
+            {
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_exp, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_expm1, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_log, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_log1p, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sin, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_cos, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_tan, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sinh, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_cosh, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_tanh, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_atan, ctx2, tol, state, 0);
+            }
+
+            reps = (prec <= 256 ? 1 : 0) * flint_test_multiplier();
+
+            for (i = 0; i < reps; i++)
+            {
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_gamma, ctx2, tol, state, 0);
+                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_zeta, ctx2, tol, state, 0);
+            }
+
+            gr_heap_clear(tol, ctx2);
+            gr_ctx_clear(ctx2);
+        }
+
         gr_ctx_clear(ctx);
     }
 

From 01b87427b191bfa044b95fc0fea8041336ff9652 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 9 May 2024 11:28:04 +0200
Subject: [PATCH 11/12] more tests and corrections

---
 src/nfloat.h               |  2 +-
 src/nfloat/nfloat.c        |  2 +-
 src/nfloat/test/t-nfloat.c | 54 ++++++++++++++++++++++++++++++++++++++
 src/python/flint_ctypes.py | 39 +++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/nfloat.h b/src/nfloat.h
index 00b9b2d5e8..a468765dc3 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -231,7 +231,7 @@ _nfloat_set_mpn_2exp(nfloat_ptr res, mp_srcptr x, mp_size_t xn, slong exp, int x
 
         if (xn > nlimbs)
         {
-            mpn_lshift(NFLOAT_D(res), x + xn - nlimbs, xn, norm);
+            mpn_lshift(NFLOAT_D(res), x + xn - nlimbs, nlimbs, norm);
             NFLOAT_D(res)[0] |= (x[xn - nlimbs - 1] >> (FLINT_BITS - norm));
         }
         else
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index 54b7c8a3ff..8e781f9a7d 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -1427,7 +1427,7 @@ int nfloat_pi(nfloat_ptr res, gr_ctx_t ctx)
 
     FLINT_ASSERT(nlimbs <= ARB_PI4_TAB_LIMBS);
 
-    NFLOAT_EXP(res) = 0;
+    NFLOAT_EXP(res) = 2;
     NFLOAT_SGNBIT(res) = 0;
     flint_mpn_copyi(NFLOAT_D(res), arb_pi4_tab + ARB_PI4_TAB_LIMBS - nlimbs, nlimbs);
 
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index e7ccf8f040..014d0eb3e6 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -338,6 +338,60 @@ TEST_FUNCTION_START(nfloat, state)
 
         nfloat_test_dot(state, (prec <= 128 ? 10000 : 100) * flint_test_multiplier(), ctx);
 
+        {
+            gr_ptr x, x2;
+            slong i;
+
+            gr_ctx_init_real_arb(ctx2, 2 + n_randint(state, 500));
+            x = gr_heap_init(ctx);
+            x2 = gr_heap_init(ctx2);
+            for (i = 0; i < 10; i++)
+            {
+                GR_MUST_SUCCEED(gr_randtest(x, state, ctx));
+                GR_MUST_SUCCEED(gr_set_other(x2, x, ctx, ctx2));
+                GR_MUST_SUCCEED(gr_set_other(x, x2, ctx2, ctx));
+            }
+            gr_heap_clear(x, ctx);
+            gr_heap_clear(x2, ctx2);
+            gr_ctx_clear(ctx2);
+
+            gr_ctx_init_fmpz(ctx2);
+            x = gr_heap_init(ctx);
+            x2 = gr_heap_init(ctx2);
+            for (i = 0; i < 10; i++)
+            {
+                GR_MUST_SUCCEED(gr_randtest(x2, state, ctx2));
+                GR_MUST_SUCCEED(gr_set_other(x, x2, ctx2, ctx));
+            }
+            gr_heap_clear(x, ctx);
+            gr_heap_clear(x2, ctx2);
+            gr_ctx_clear(ctx2);
+
+            gr_ctx_init_fmpq(ctx2);
+            x = gr_heap_init(ctx);
+            x2 = gr_heap_init(ctx2);
+            for (i = 0; i < 10; i++)
+            {
+                GR_MUST_SUCCEED(gr_randtest(x2, state, ctx2));
+                GR_MUST_SUCCEED(gr_set_other(x, x2, ctx2, ctx));
+            }
+            gr_heap_clear(x, ctx);
+            gr_heap_clear(x2, ctx2);
+            gr_ctx_clear(ctx2);
+
+            gr_ctx_init_real_qqbar(ctx2);
+            x = gr_heap_init(ctx);
+            x2 = gr_heap_init(ctx2);
+            for (i = 0; i < 3; i++)
+            {
+                GR_MUST_SUCCEED(gr_randtest(x2, state, ctx2));
+                GR_MUST_SUCCEED(gr_set_other(x, x2, ctx2, ctx));
+            }
+            gr_heap_clear(x, ctx);
+            gr_heap_clear(x2, ctx2);
+            gr_ctx_clear(ctx2);
+        }
+
         {
             gr_ptr tol;
             slong i, reps;
diff --git a/src/python/flint_ctypes.py b/src/python/flint_ctypes.py
index f68df40d51..78200dc0c4 100644
--- a/src/python/flint_ctypes.py
+++ b/src/python/flint_ctypes.py
@@ -8188,6 +8188,45 @@ def test_integers_mod():
     assert raises(lambda: IntegersMod_mpn_mod(10**20)(IntegersMod_mpn_mod(10**20 + 1)(17)), NotImplementedError)
     assert raises(lambda: IntegersMod_mpn_mod(10**20)(IntegersMod_mpn_mod(10**50)(17)), NotImplementedError)
 
+def test_nfloat():
+    R = RealFloat_nfloat(128)
+    R2 = RealField_arb(192)
+    R3 = RealFloat_nfloat(256)
+    tol = R(2.0**(-120))
+    tol2 = R2(2.0**(-120))
+    assert R2(R(3)) == 3
+    assert R(R2(3)) == 3
+    assert R2(R(0)) == 0
+    assert R(R2(0)) == 0
+    assert abs(R(R.pi() - R2.pi())) < tol
+    assert abs(R2(R.pi() - R2.pi())) < tol
+    assert abs(R(R.pi() - R2.pi())) < tol2
+    assert abs(R2(R.pi() - R2.pi())) < tol2
+    assert abs(R(QQ(1)/3) - QQ(1)/3) < tol
+    assert R(ZZ(5)) == 5
+    assert R(-ZZ(5)) == -5
+    c = ZZ(5)**100
+    assert abs(R2(R(c)) - c) < tol * c
+    assert abs(R2(R(-c)) - (-c)) < tol * c
+    assert R(R2(5)) == R(5)
+    assert str(R(0)) == '0'
+    assert str(R(1) / 4) == '0.250000000000000000000000000000000000000'
+    assert R("0.25") == 0.25
+    assert R(-0.25) == -0.25
+    assert abs(R(R.pi() - R3.pi())) < tol
+    assert abs(R3(R.pi() - R3.pi())) < tol
+    assert R3(R(0)) == 0
+    assert R3(R(-1)) == -1
+    assert R(R3(0)) == 0
+    assert R(-R3(1)) == -1
+    assert R(3) <= R(3)
+    assert R(-3) <= R(3)
+    assert not (R(3) < R(3))
+    assert not (R(3) <= R(-3))
+    assert R(0) <= R(3)
+    assert R(0) <= R(0)
+    assert not (R(0) < R(0))
+    assert not (R(0) <= R(-3))
 
 def test_gen_name():
     for R in [NumberField(ZZx.gen() ** 2 + 1, "b"),

From 2859c7e3367b8dc5aae3a3cb05cdf99d8e64414c Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 9 May 2024 18:54:51 +0200
Subject: [PATCH 12/12] more test code

---
 src/nfloat/test/t-nfloat.c | 244 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 1 deletion(-)

diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index 014d0eb3e6..b808531d63 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -13,8 +13,52 @@
 #include "gr_vec.h"
 #include "gr_special.h"
 #include "arf.h"
+#include "fmpq.h"
 #include "nfloat.h"
 
+int
+gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int cmp1, cmp2;
+    gr_ptr a, b, a_ref, b_ref;
+
+    GR_TMP_INIT2(a, b, R);
+    GR_TMP_INIT2(a_ref, b_ref, R_ref);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+    status |= gr_set_other(b_ref, b, R, R_ref);
+
+    status |= op(&cmp1, a, b, R);
+    status |= op(&cmp2, a_ref, b_ref, R_ref);
+
+    if (status == GR_SUCCESS && cmp1 != cmp2)
+        status = GR_TEST_FAIL;
+
+    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+    {
+        flint_printf("\n");
+        gr_ctx_println(R);
+        gr_ctx_println(R_ref);
+        flint_printf("a = "); gr_println(a, R);
+        flint_printf("b = "); gr_println(b, R);
+        flint_printf("cmp1 = %d\n", cmp1);
+        flint_printf("cmp2 = %d\n", cmp2);
+        flint_printf("\n");
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR2(a, b, R);
+    GR_TMP_CLEAR2(a_ref, b_ref, R_ref);
+
+    return status;
+}
+
 int
 gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
 {
@@ -184,6 +228,167 @@ gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_
     return status;
 }
 
+int
+gr_test_approx_binary_op_type_variants(gr_ctx_t R,
+    const char * opname,
+    gr_method_binary_op gr_op,
+    gr_method_binary_op_ui gr_op_ui,
+    gr_method_binary_op_si gr_op_si,
+    gr_method_binary_op_fmpz gr_op_fmpz,
+    gr_method_binary_op_fmpq gr_op_fmpq,
+    int fused,
+    int small_test_values,
+    gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status, alias, which;
+    gr_ptr x, y, xy1, xy2, err;
+    ulong uy;
+    slong sy;
+    fmpz_t zy;
+    fmpq_t qy;
+
+    GR_TMP_INIT5(x, y, xy1, xy2, err, R);
+    fmpz_init(zy);
+    fmpq_init(qy);
+
+    which = 0;
+
+    if (small_test_values)
+    {
+        uy = n_randint(state, 6);
+        sy = -5 + (slong) n_randint(state, 11);
+        fmpz_randtest(zy, state, 3);
+        fmpq_randtest(qy, state, 3);
+    }
+    else
+    {
+        uy = n_randtest(state);
+        sy = (slong) n_randtest(state);
+
+        if (n_randint(state, 10) == 0)
+        {
+            fmpz_randtest(zy, state, 10000);
+            fmpq_randtest(qy, state, 200);
+        }
+        else
+        {
+            fmpz_randtest(zy, state, 100);
+            fmpq_randtest(qy, state, 100);
+        }
+    }
+
+    for (which = 0; which < 4; which++)
+    {
+        status = GR_SUCCESS;
+        alias = n_randint(state, 2);
+
+        GR_MUST_SUCCEED(gr_randtest(x, state, R));
+        GR_MUST_SUCCEED(gr_randtest(y, state, R));
+        GR_MUST_SUCCEED(gr_randtest(xy1, state, R));
+
+        if (fused && alias)
+            GR_MUST_SUCCEED(gr_set(xy2, x, R));
+        else if (fused)
+            GR_MUST_SUCCEED(gr_set(xy2, xy1, R));
+        else
+            GR_MUST_SUCCEED(gr_randtest(xy2, state, R));
+
+        if (alias)
+            GR_MUST_SUCCEED(gr_set(xy1, x, R));
+
+        if (which == 0)
+        {
+            if (alias)
+                status |= gr_op_ui(xy1, xy1, uy, R);
+            else
+                status |= gr_op_ui(xy1, x, uy, R);
+
+            status |= gr_set_ui(y, uy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else if (which == 1)
+        {
+            if (alias)
+                status |= gr_op_si(xy1, xy1, sy, R);
+            else
+                status |= gr_op_si(xy1, x, sy, R);
+            status |= gr_set_si(y, sy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else if (which == 2)
+        {
+            if (alias)
+                status |= gr_op_fmpz(xy1, xy1, zy, R);
+            else
+                status |= gr_op_fmpz(xy1, x, zy, R);
+            status |= gr_set_fmpz(y, zy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else
+        {
+            if (alias)
+                status |= gr_op_fmpq(xy1, xy1, qy, R);
+            else
+                status |= gr_op_fmpq(xy1, x, qy, R);
+            status |= gr_set_fmpq(y, qy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            int cmp;
+
+            status |= gr_sub(err, xy1, xy2, R);
+            status |= gr_div(err, err, xy2, R);
+            status |= gr_abs(err, err, R);
+            status |= gr_cmp(&cmp, err, rel_tol, R);
+
+            if (status == GR_SUCCESS && cmp > 0)
+            {
+                status = GR_TEST_FAIL;
+                break;
+            }
+        }
+    }
+
+    if ((test_flags & GR_TEST_ALWAYS_ABLE) && (status & GR_UNABLE))
+        status = GR_TEST_FAIL;
+
+    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+    {
+        flint_printf("\n");
+        flint_printf("%s\n", opname);
+        gr_ctx_println(R);
+        flint_printf("which: %d\n", which);
+        flint_printf("alias: %d\n", alias);
+        flint_printf("x = "); gr_println(x, R);
+        flint_printf("y = "); gr_println(y, R);
+        flint_printf("y (op) y (1) = "); gr_println(xy1, R);
+        flint_printf("x (op) y (2) = "); gr_println(xy2, R);
+        flint_printf("err = "); gr_println(err, R);
+        flint_printf("tol = "); gr_println(rel_tol, R);
+        flint_printf("\n");
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR5(x, y, xy1, xy2, err, R);
+
+    fmpz_clear(zy);
+    fmpq_clear(qy);
+
+    return status;
+}
+
 void
 nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
 {
@@ -393,11 +598,12 @@ TEST_FUNCTION_START(nfloat, state)
         }
 
         {
-            gr_ptr tol;
+            gr_ptr tol1, tol;
             slong i, reps;
 
             gr_ctx_init_real_arb(ctx2, prec + 32);
 
+            tol1 = gr_heap_init(ctx);
             tol = gr_heap_init(ctx2);
 
             GR_IGNORE(gr_one(tol, ctx2));
@@ -407,6 +613,9 @@ TEST_FUNCTION_START(nfloat, state)
 
             for (i = 0; i < reps; i++)
             {
+                gr_test_cmp_fun(ctx, (gr_method_binary_op_get_int) gr_cmp, ctx2, state, 0);
+                gr_test_cmp_fun(ctx, (gr_method_binary_op_get_int) gr_cmpabs, ctx2, state, 0);
+
                 gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0);
                 gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0);
                 gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_pow, ctx2, tol, state, 0);
@@ -421,6 +630,38 @@ TEST_FUNCTION_START(nfloat, state)
                 gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_ceil, ctx2, tol, state, 0);
                 gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_nint, ctx2, tol, state, 0);
                 gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_trunc, ctx2, tol, state, 0);
+
+                GR_IGNORE(gr_one(tol1, ctx));
+                GR_IGNORE(gr_mul_2exp_si(tol1, tol1, -prec + 4, ctx));
+                gr_test_approx_binary_op_type_variants(ctx,
+                    "mul",
+                    (gr_method_binary_op) gr_mul,
+                    (gr_method_binary_op_ui) gr_mul_ui,
+                    (gr_method_binary_op_si) gr_mul_si,
+                    (gr_method_binary_op_fmpz) gr_mul_fmpz,
+                    (gr_method_binary_op_fmpq) gr_mul_fmpq,
+                    0, 0, tol1, state, 0);
+
+                gr_test_approx_binary_op_type_variants(ctx,
+                    "div",
+                    (gr_method_binary_op) gr_div,
+                    (gr_method_binary_op_ui) gr_div_ui,
+                    (gr_method_binary_op_si) gr_div_si,
+                    (gr_method_binary_op_fmpz) gr_div_fmpz,
+                    (gr_method_binary_op_fmpq) gr_div_fmpq,
+                    0, 0, tol1, state, 0);
+
+                /* todo: test with large values also */
+                GR_IGNORE(gr_one(tol1, ctx));
+                GR_IGNORE(gr_mul_2exp_si(tol1, tol1, -prec + 6, ctx));
+                gr_test_approx_binary_op_type_variants(ctx,
+                    "pow",
+                    (gr_method_binary_op) gr_pow,
+                    (gr_method_binary_op_ui) gr_pow_ui,
+                    (gr_method_binary_op_si) gr_pow_si,
+                    (gr_method_binary_op_fmpz) gr_pow_fmpz,
+                    (gr_method_binary_op_fmpq) gr_pow_fmpq,
+                    0, 1, tol1, state, 0);
             }
 
             reps = (prec <= 256 ? 10 : 0) * flint_test_multiplier();
@@ -448,6 +689,7 @@ TEST_FUNCTION_START(nfloat, state)
                 gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_zeta, ctx2, tol, state, 0);
             }
 
+            gr_heap_clear(tol1, ctx);
             gr_heap_clear(tol, ctx2);
             gr_ctx_clear(ctx2);
         }