miniextendr_api/r_memory.rs
1//! Utilities for recovering R SEXPs from raw data pointers.
2//!
//! R stores vector data at a fixed offset after the SEXPREC header. Given a
//! pointer into that data region, we can subtract the header size to recover
//! the SEXP — then verify it by checking the type tag (a raw `sxpinfo` read),
//! the ALTREP bit, and the vector length, using only operations that neither
//! dispatch nor raise an R error on a non-ALTREP candidate.
7//!
8//! This is used by:
9//! - Arrow integration: zero-copy IntoR when the buffer is R-backed
10//! - `Cow<[T]>` IntoR round-trip
11//!
12//! # Initialization
13//!
14//! [`init_sexprec_data_offset`] must be called during package init (before any
15//! recovery attempts). It measures the offset on a real R vector, so it works
16//! across R versions and platforms.
17//!
18//! # R's VECTOR_SEXPREC layout
19//!
20//! ```text
21//! // From R's Defn.h:
22//! typedef struct VECTOR_SEXPREC {
23//! SEXPREC_HEADER; // sxpinfo(8) + attrib(8) + gengc_next(8) + gengc_prev(8)
24//! struct vecsxp_struct { // length(8) + truelength(8)
25//! R_xlen_t length;
26//! R_xlen_t truelength;
27//! } vecsxp;
28//! } VECTOR_SEXPREC;
29//!
30//! typedef union { VECTOR_SEXPREC s; double align; } SEXPREC_ALIGN;
31//! #define STDVEC_DATAPTR(x) ((void *)(((SEXPREC_ALIGN *)(x)) + 1))
32//! ```
33//!
34//! On 64-bit: `sizeof(VECTOR_SEXPREC)` = 48 bytes, `sizeof(SEXPREC_ALIGN)` = 48.
35//! Data starts at `sexp + 48`. All vector types (REALSXP, INTSXP, RAWSXP,
36//! STRSXP, VECSXP) use the same `VECTOR_SEXPREC` header.
37//!
38//! # Why not `#[repr(C)]` mirror struct?
39//!
40//! A Rust `#[repr(C)]` struct mirroring `VECTOR_SEXPREC` would give a
41//! compile-time `size_of` instead of runtime measurement. However:
42//! - R's layout can vary by version and compile options (32-bit, padding)
43//! - The runtime measurement is one allocation at init — negligible
44//! - A `repr(C)` mirror struct doesn't help with the real safety issue:
45//! reading from a speculative pointer. `addr_of!` computes field addresses
46//! without dereferencing, but we still need to `read()` the type tag — and
47//! that read is from potentially invalid memory for non-R pointers.
48//!
49//! The verification (type tag + ALTREP check + XLENGTH) prevents false
50//! positives. Only the type tag requires a raw sxpinfo read; ALTREP and
51//! XLENGTH use R's public C API.
52//!
53//! # Safety of speculative reads
54//!
55//! The candidate pointer is computed from pointer arithmetic on the input
56//! data_ptr. For Rust-owned buffers (not R-backed), this points into
57//! arbitrary heap memory. We must be careful about which R functions we
58//! call on it:
59//!
60//! - **`ALTREP(x)`** — safe: just reads `x->sxpinfo.alt` (a single bit).
61//! - **`XLENGTH(x)`** on non-ALTREP — safe: reads `STDVEC_LENGTH` (struct
62//! field, no dispatch, no error).
63//! - **`LENGTH(x)`** — UNSAFE: wraps XLENGTH with `> INT_MAX` check that
64//! calls `R_BadLongVector()` (throws R error on garbage with large length).
65//! - **`DATAPTR_RO(x)`** — UNSAFE on ALTREP: dispatches through class vtable
66//! (bogus function pointers on garbage). On non-ALTREP: `STDVEC_DATAPTR`
67//! which also checks for long vectors.
68//!
69//! The verification sequence is:
70//! 1. Raw sxpinfo type tag (bits 0-4) — no public TYPEOF that's safe on garbage
71//! 2. `ALTREP(candidate)` — gates step 3 (rejects ALTREP before XLENGTH dispatch)
72//! 3. `XLENGTH(candidate)` — safe for non-ALTREP (STDVEC_LENGTH, no errors)
73
74use std::sync::atomic::{AtomicUsize, Ordering};
75
76use crate::ffi::{self, SEXP, SEXPTYPE, SexpExt};
77
/// Distance in bytes between a standard (non-ALTREP) vector's SEXP address
/// and the first byte of its data.
///
/// Invariant once measured:
/// `DATAPTR_RO(sexp) == (sexp as *const u8).add(SEXPREC_DATA_OFFSET)`
///
/// A stored value of zero means the offset has not been measured yet.
static SEXPREC_DATA_OFFSET: AtomicUsize = AtomicUsize::new(0);

/// Read the measured SEXPREC data offset.
///
/// Yields 0 when [`init_sexprec_data_offset`] has not stored a value yet.
#[inline]
pub fn sexprec_data_offset() -> usize {
    // Relaxed load, matching the relaxed store performed at init.
    AtomicUsize::load(&SEXPREC_DATA_OFFSET, Ordering::Relaxed)
}
92
93/// Compute and store the SEXPREC data offset by measuring a real R vector.
94///
95/// Must be called from R's main thread during package init.
96///
97/// # Safety
98///
99/// Must be called on R's main thread with R initialized.
100pub unsafe fn init_sexprec_data_offset() {
101 unsafe {
102 let test = ffi::Rf_protect(ffi::Rf_allocVector(SEXPTYPE::REALSXP, 1));
103 let sexp_addr = test.0 as usize;
104 let data_addr = ffi::DATAPTR_RO(test) as usize;
105 SEXPREC_DATA_OFFSET.store(data_addr - sexp_addr, Ordering::Relaxed);
106 ffi::Rf_unprotect(1);
107 }
108}
109
110/// Try to recover the source R SEXP from a data pointer.
111///
112/// Given a pointer that may point into an R vector's data area, this
113/// subtracts the known SEXPREC header size to get a candidate SEXP, then
114/// verifies it:
115/// 1. The SEXP type tag (bits 0-4 of sxpinfo) matches `expected_type`
116/// 2. `ALTREP(candidate)` is false (only non-ALTREP vectors have fixed-offset data)
117/// 3. `XLENGTH(candidate)` matches `expected_len` (safe for non-ALTREP)
118///
119/// Returns `None` if:
120/// - The offset hasn't been initialized yet
121/// - The pointer doesn't come from an R vector
122/// - The candidate SEXP has the wrong type or length
123/// - The candidate is an ALTREP vector (data not at fixed offset from SEXP)
124///
125/// # Why this is outside Rust's memory model (see #63)
126///
127/// This is a conservative-GC-style probe, analogous to Boehm GC scanning
128/// the heap without allocation provenance. We compute a speculative pointer
129/// via `wrapping_byte_sub` (well-defined pointer arithmetic) and read the
130/// first 4 bytes (sxpinfo bits) to check whether the address looks like the
131/// start of a SEXPREC. For pointers that did not come from an R SEXP, that
132/// read has no valid allocation provenance under Rust's Stacked / Tree
133/// Borrows model — it's defined behavior at the hardware level (the heap
134/// is contiguous mapped memory), but Miri correctly flags it as UB.
135///
136/// We guard the read with a 4096-byte address floor (below which the
137/// candidate would cross into unmapped memory), the ALTREP bit check
138/// (prevents calling dispatch fns on garbage), and the length check
139/// (filters random garbage with high probability). Callers that cannot
140/// tolerate a false positive must not rely on this path alone.
141///
142/// To keep Miri green, the whole recovery is a no-op under `#[cfg(miri)]`:
143/// we always return `None`, and callers fall back to the copy path. This
144/// is not a correctness change — the copy path is always a valid alternative.
145///
146/// # Safety
147///
148/// Must be called on R's main thread. The data pointer must be valid
149/// (i.e., it must point to readable memory for at least `expected_len`
150/// elements, which is guaranteed if it came from an Arrow buffer).
151pub unsafe fn try_recover_r_sexp(
152 data_ptr: *const u8,
153 expected_type: SEXPTYPE,
154 expected_len: usize,
155) -> Option<SEXP> {
156 // Speculative sxpinfo read has no provenance for Rust-allocated buffers
157 // (see `#63`). Under Miri, skip the probe entirely so the copy path is
158 // exercised and the analyzer stays clean. No functional regression.
159 #[cfg(miri)]
160 {
161 let _ = (data_ptr, expected_type, expected_len);
162 return None;
163 }
164
165 #[cfg_attr(miri, allow(unreachable_code))]
166 let offset = SEXPREC_DATA_OFFSET.load(Ordering::Relaxed);
167 if offset == 0 {
168 return None;
169 }
170
171 // Zero-length vectors can't be recovered (R uses sentinel pointer 0x1,
172 // and empty Arrow buffers use dangling pointers).
173 if expected_len == 0 {
174 return None;
175 }
176
177 let data_addr = data_ptr as usize;
178
179 // Reject pointers that would wraparound or are in invalid ranges.
180 // R's sentinel for empty vectors is 0x1; wrapping_byte_sub on small
181 // addresses produces huge values (top of address space) → segfault.
182 // The 4096 threshold also guards against speculative reads near page
183 // boundaries — for non-R pointers (e.g. Rust-allocated Arrow buffers),
184 // subtracting the offset could land before the start of mapped memory.
185 if data_addr < offset.saturating_add(4096) {
186 return None;
187 }
188
189 // Compute candidate SEXP by subtracting header size.
190 // wrapping_byte_sub is defined behavior for all pointer arithmetic.
191 let candidate_ptr = (data_ptr as *mut ffi::SEXPREC).wrapping_byte_sub(offset);
192
193 let candidate = SEXP(candidate_ptr);
194
195 // Quick check: type tag (bits 0-4 of sxpinfo, which is the first field).
196 // For Rust-allocated buffers this reads arbitrary heap memory, but
197 // wrapping_sub ensures the pointer arithmetic itself is defined.
198 // The read is a plain u32 load from mapped heap.
199 // No public R API reads TYPEOF without side effects on invalid pointers,
200 // so a raw sxpinfo read is unavoidable here.
201 let sxpinfo_bits = unsafe { *(candidate.0 as *const u32) };
202 let type_bits = sxpinfo_bits & 0x1f;
203 if type_bits != expected_type as u32 {
204 return None;
205 }
206
207 // Reject ALTREP via R's public API (SexpExt::is_altrep → ALTREP(x)).
208 // ALTREP vectors store data via indirection — can't recover them.
209 // This also gates the xlength() call below: for non-ALTREP, XLENGTH is
210 // just STDVEC_LENGTH (a struct field read, no dispatch, no error).
211 // For ALTREP, XLENGTH dispatches through the class vtable which would
212 // crash on a garbage pointer.
213 if candidate.is_altrep() {
214 return None;
215 }
216
217 // For non-ALTREP, xlength() → Rf_xlength → STDVEC_LENGTH — a direct
218 // struct field read with no dispatch and no "long vectors not supported"
219 // error. (LENGTH() wraps XLENGTH with an INT_MAX check that can
220 // R_BadLongVector; XLENGTH itself never errors for non-ALTREP vectors.)
221 if candidate.len() != expected_len {
222 return None;
223 }
224
225 // No DATAPTR_RO round-trip check needed: for non-ALTREP vectors,
226 // STDVEC_DATAPTR(x) == (char*)x + SEXPREC_DATA_OFFSET, and we
227 // constructed candidate = data_ptr - offset, so the round-trip
228 // is tautologically true. The type + ALTREP + length checks above
229 // are the actual discriminators.
230
231 Some(candidate)
232}