Skip to main content

miniextendr_api/
encoding.rs

1//! Encoding / locale probing utilities.
2//!
3//! This module exists mainly for debugging + experiments around R's string
4//! encodings. R's runtime has both:
5//! - per-CHARSXP encoding tags (UTF-8 / Latin-1 / bytes / native)
6//! - global/locale-level settings (native encoding, UTF-8 locale flags)
7//!
8//! # Availability
9//!
10//! The global signals are **non-API** (from `Defn.h`) and require the `nonapi` feature.
11//! Additionally, these symbols are **not exported from R's shared library**, so
12//! `miniextendr_encoding_init()` only works when:
13//! - Embedding R via `miniextendr-engine` (which links directly to R internals)
14//! - Running on platforms where these symbols happen to be exported
15//!
16//! For R packages (loaded via `.Call`), these symbols are typically unavailable,
17//! so `miniextendr_encoding_init()` is **disabled by default** in the entrypoint.
18//! The module is still useful for standalone Rust applications embedding R.
19
20use std::sync::OnceLock;
21
/// Snapshot of R's encoding / locale state, captured at init time.
///
/// Every field is gated on the `nonapi` feature; without that feature the
/// struct carries no data.
#[derive(Clone, Debug)]
pub struct REncodingInfo {
    /// Native encoding as reported by R (non-API).
    #[cfg(feature = "nonapi")]
    pub native_encoding: Option<String>,
    /// `true` when R considers the current locale to be UTF-8 (non-API).
    #[cfg(feature = "nonapi")]
    pub utf8_locale: Option<bool>,
    /// `true` when R considers the current locale to be Latin-1 (non-API).
    #[cfg(feature = "nonapi")]
    pub latin1_locale: Option<bool>,
    /// `true` when R has flagged the session as "known to be UTF-8" (non-API).
    #[cfg(feature = "nonapi")]
    pub known_to_be_utf8: Option<bool>,
}
38
/// Process-wide cache for the encoding snapshot: written at most once by
/// `miniextendr_encoding_init()` and read through `encoding_info()`.
static ENCODING_INFO: OnceLock<REncodingInfo> = OnceLock::new();
40
41/// Return the cached encoding info (if `miniextendr_encoding_init()` has run).
42#[inline]
43pub fn encoding_info() -> Option<&'static REncodingInfo> {
44    ENCODING_INFO.get()
45}
46
47/// Assert that R's locale is UTF-8.
48///
49/// Called once from `R_init_*` (package init). Errors if the R session
50/// does not use UTF-8, since `charsxp_to_str` assumes all CHARSXP bytes
51/// are valid UTF-8.
52///
53/// Uses `l10n_info()[["UTF-8"]]` which is public R API.
54#[unsafe(no_mangle)]
55pub extern "C" fn miniextendr_assert_utf8_locale() {
56    debug_assert!(
57        crate::worker::is_r_main_thread(),
58        "must be called from R main thread"
59    );
60    use crate::ffi::{R_BaseEnv, Rf_eval, Rf_install, Rf_protect, Rf_unprotect, SexpExt};
61
62    unsafe {
63        // Call l10n_info()
64        let call = crate::ffi::Rf_lang1(Rf_install(c"l10n_info".as_ptr()));
65        Rf_protect(call);
66        let info = Rf_eval(call, R_BaseEnv);
67        Rf_protect(info);
68
69        // Find the "UTF-8" element by name
70        let names = info.get_names();
71        let n = info.xlength();
72        let mut is_utf8 = false;
73        for i in 0..n {
74            let name_charsxp = names.string_elt(i);
75            let name_ptr = name_charsxp.r_char();
76            let name = std::ffi::CStr::from_ptr(name_ptr);
77            if name == c"UTF-8" {
78                let elt = info.vector_elt(i);
79                is_utf8 = elt.logical_elt(0) != 0;
80                break;
81            }
82        }
83
84        Rf_unprotect(2);
85
86        if !is_utf8 {
87            crate::ffi::Rf_error_unchecked(
88                c"%s".as_ptr(),
89                c"miniextendr requires a UTF-8 locale (R >= 4.2.0 uses UTF-8 by default)".as_ptr(),
90            );
91        }
92    }
93}
94
95/// Initialize / snapshot R's encoding state.
96///
97/// Intended to be called once from `R_init_*` (package init).
98#[unsafe(no_mangle)]
99pub extern "C-unwind" fn miniextendr_encoding_init() {
100    let _ = ENCODING_INFO.get_or_init(|| {
101        #[cfg(feature = "nonapi")]
102        unsafe {
103            use crate::ffi::{Rboolean, nonapi_encoding};
104
105            let native_encoding = {
106                let ptr = nonapi_encoding::R_nativeEncoding();
107                if ptr.is_null() {
108                    None
109                } else {
110                    Some(std::ffi::CStr::from_ptr(ptr).to_string_lossy().into_owned())
111                }
112            };
113
114            let utf8_locale = Some(nonapi_encoding::utf8locale != Rboolean::FALSE);
115            let latin1_locale = Some(nonapi_encoding::latin1locale != Rboolean::FALSE);
116            let known_to_be_utf8 = Some(nonapi_encoding::known_to_be_utf8 != Rboolean::FALSE);
117
118            let info = REncodingInfo {
119                native_encoding,
120                utf8_locale,
121                latin1_locale,
122                known_to_be_utf8,
123            };
124
125            if std::env::var_os("MINIEXTENDR_ENCODING_DEBUG").is_some() {
126                let msg = format!("[miniextendr] encoding init: {info:?}\n");
127                if let Ok(c_msg) = std::ffi::CString::new(msg) {
128                    crate::ffi::REprintf_unchecked(c"%s".as_ptr(), c_msg.as_ptr());
129                }
130            }
131
132            info
133        }
134
135        #[cfg(not(feature = "nonapi"))]
136        {
137            REncodingInfo {}
138        }
139    });
140}