diff --git a/pure-magic/Cargo.toml b/pure-magic/Cargo.toml index 9f6cc18..c61800c 100644 --- a/pure-magic/Cargo.toml +++ b/pure-magic/Cargo.toml @@ -9,6 +9,10 @@ repository.workspace = true keywords = ["magic", "file", "mime", "identification", "libmagic"] description = "Safe Rust re-implementation of libmagic" +[features] +default = [] +sync = [] + [dependencies] chrono = "0.4.41" dyf = { version = "0.1.1", features = ["serde"] } diff --git a/pure-magic/src/dou.rs b/pure-magic/src/dou.rs new file mode 100644 index 0000000..4ff4255 --- /dev/null +++ b/pure-magic/src/dou.rs @@ -0,0 +1,162 @@ +#[cfg(not(feature = "sync"))] +pub(crate) use default::*; + +#[cfg(feature = "sync")] +pub(crate) use sync::*; + +#[cfg(not(feature = "sync"))] +mod default { + use std::cell::{OnceCell, RefCell}; + + use serde::{Deserialize, Serialize}; + + use crate::EntryNode; + + /// Deserialize-on-Use wrapper for an `EntryNode`. + /// + /// This struct implements the "Deserialize on Use" pattern, where the `EntryNode` is + /// deserialized only when first accessed via `get_or_de()`. The serialized form is stored + /// in a `RefCell>>`, allowing for interior mutability, while the actual + /// `EntryNode` is stored in a `OnceCell`, ensuring it is deserialized at most once. + /// + /// The serialized data is consumed during deserialization, ensuring it is only used once. + #[derive(Debug, Deserialize)] + pub(crate) struct DouEntryNode { + ser: RefCell>>, + #[serde(skip)] + entry: OnceCell, + } + + impl From for DouEntryNode { + fn from(value: EntryNode) -> Self { + let cell = OnceCell::new(); + cell.set(value).unwrap(); + Self { + ser: RefCell::new(None), + entry: cell, + } + } + } + + impl Serialize for DouEntryNode { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + #[derive(Debug, Serialize)] + struct Tmp { + ser: RefCell>>, + } + + let ser = self.ser.borrow_mut(); + let tmp = if ser.is_none() { + Tmp { + ser: RefCell::new(Some( + bincode::serde::encode_to_vec( + self.entry.get().unwrap(), + bincode::config::standard(), + ) + .unwrap(), + )), + } + } else { + Tmp { + ser: self.ser.clone(), + } + }; + + tmp.serialize(serializer) + } + } + + impl DouEntryNode { + /// Returns a reference to the entry node. + pub(crate) fn get_or_de(&self) -> &EntryNode { + self.entry.get_or_init(|| { + let ser = self.ser.borrow_mut().take().unwrap(); + let (e, _) = + bincode::serde::decode_from_slice(&ser, bincode::config::standard()).unwrap(); + e + }) + } + } +} + +#[cfg(feature = "sync")] +mod sync { + use std::sync::{OnceLock, RwLock}; + + use serde::{Deserialize, Serialize}; + + use crate::EntryNode; + + /// Thread-safe Deserialize-on-Use wrapper for an `EntryNode`. + /// + /// This struct implements the "Deserialize on Use" pattern in a thread-safe manner. + /// The `EntryNode` is deserialized only when first accessed via `get_or_de()`. + /// The serialized form is stored in an `RwLock>>`, allowing for concurrent + /// read access or exclusive write access. The actual `EntryNode` is stored in a `OnceLock`, + /// ensuring thread-safe, one-time deserialization. + /// + /// The serialized data is consumed during deserialization, ensuring it is only used once. + #[derive(Debug, Deserialize)] + pub(crate) struct DouEntryNode { + ser: RwLock>>, + #[serde(skip)] + entry: OnceLock, + } + + impl From for DouEntryNode { + fn from(value: EntryNode) -> Self { + let cell = OnceLock::new(); + cell.set(value).unwrap(); + Self { + ser: RwLock::new(None), + entry: cell, + } + } + } + + impl Serialize for DouEntryNode { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + #[derive(Debug, Serialize)] + struct Tmp { + ser: RwLock>>, + } + + let ser = self.ser.read().unwrap(); + let tmp = if ser.is_none() { + Tmp { + ser: RwLock::new(Some( + bincode::serde::encode_to_vec( + self.entry.get().unwrap(), + bincode::config::standard(), + ) + .unwrap(), + )), + } + } else { + Tmp { + ser: RwLock::new(ser.clone()), + } + }; + + tmp.serialize(serializer) + } + } + + impl DouEntryNode { + /// Returns a reference to the entry node. + pub(crate) fn get_or_de(&self) -> &EntryNode { + self.entry.get_or_init(|| { + let ser = self.ser.write().unwrap().take().unwrap(); + let (e, _) = + bincode::serde::decode_from_slice(&ser, bincode::config::standard()).unwrap(); + e + }) + } + } +} diff --git a/pure-magic/src/lib.rs b/pure-magic/src/lib.rs index 0ed8d4b..0bd4cae 100644 --- a/pure-magic/src/lib.rs +++ b/pure-magic/src/lib.rs @@ -167,6 +167,7 @@ use crate::{ utils::{decode_id3, find_json_boundaries, run_utf8_validation}, }; +mod dou; mod numeric; mod parser; mod utils; @@ -2320,7 +2321,7 @@ enum Entry<'span> { Flag(Span<'span>, Flag), } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct EntryNode { root: bool, entry: Match, @@ -2547,11 +2548,11 @@ impl EntryNode { } /// Represents a parsed magic rule -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct MagicRule { id: usize, source: Option, - entries: EntryNode, + entries: dou::DouEntryNode, extensions: HashSet, /// score used for rule ranking score: u64, @@ -2573,7 +2574,9 @@ impl MagicRule { marked: &mut HashSet, ) -> Result, ()> { let mut exts = HashSet::new(); - self.entries.update_exts_rec(&mut exts, deps, marked)?; + self.entries + .get_or_de() + .update_exts_rec(&mut exts, deps, marked)?; Ok(exts) } @@ -2586,8 +2589,9 @@ impl MagicRule { marked: &mut HashSet, ) -> u64 { let mut score = 0; - score += self.entries.entry.test_strength; + score += self.entries.get_or_de().entry.test_strength; self.entries + .get_or_de() .update_score_rec(depth, &mut score, deps, marked); score } @@ -2622,7 +2626,7 @@ impl MagicRule { switch_endianness: bool, depth: usize, ) -> Result<(), Error> { - self.entries.matches( + self.entries.get_or_de().matches( self.source.as_deref(), magic, &mut MatchState::empty(), @@ -2650,7 +2654,7 @@ impl MagicRule { switch_endianness: bool, depth: usize, ) -> Result<(), Error> { - self.entries.matches( + self.entries.get_or_de().matches( self.source.as_deref(), magic, &mut MatchState::empty(), @@ -2671,8 +2675,13 @@ impl MagicRule { /// /// * `bool` - True if the rule is for text files pub fn is_text(&self) -> bool { - self.entries.entry.test.is_text() - && self.entries.children.iter().all(|e| e.entry.test.is_text()) + self.entries.get_or_de().entry.test.is_text() + && self + .entries + .get_or_de() + .children + .iter() + .all(|e| e.entry.test.is_text()) } /// Gets the rule's score used for ranking rules between them @@ -2702,7 +2711,7 @@ impl MagicRule { /// * `usize` - The rule's line number #[inline(always)] pub fn line(&self) -> usize { - self.entries.entry.line + self.entries.get_or_de().entry.line } /// Gets all the file extensions associated to the rule @@ -2716,7 +2725,7 @@ impl MagicRule { } } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct DependencyRule { name: String, rule: MagicRule, @@ -2727,7 +2736,7 @@ struct DependencyRule { /// # Methods /// /// * `open` - Opens a magic file from a path -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct MagicSource { rules: Vec, dependencies: HashMap, @@ -3030,7 +3039,7 @@ impl<'m> Magic<'m> { } /// Represents a database of [`MagicRule`] -#[derive(Debug, Default, Clone, Serialize, Deserialize)] +#[derive(Debug, Default, Serialize, Deserialize)] pub struct MagicDb { rule_id: usize, rules: Vec, diff --git a/pure-magic/src/parser.rs b/pure-magic/src/parser.rs index 89ce19b..1f126cf 100644 --- a/pure-magic/src/parser.rs +++ b/pure-magic/src/parser.rs @@ -672,9 +672,10 @@ impl Use { let mut message = None; if let Some(msg_pair) = pairs.next() - && !msg_pair.as_str().is_empty() { - message = Some(Message::from_pair(msg_pair)?); - }; + && !msg_pair.as_str().is_empty() + { + message = Some(Message::from_pair(msg_pair)?); + }; Ok(Self { line, @@ -1443,9 +1444,10 @@ impl MagicRule { let mut message = None; if let Some(msg) = pairs.next() - && !msg.as_str().is_empty() { - message = Some(Message::from_pair(msg)?) - } + && !msg.as_str().is_empty() + { + message = Some(Message::from_pair(msg)?) + } items.push(Entry::Match( span, @@ -1475,7 +1477,7 @@ impl MagicRule { Ok(Self { id: 0, source, - entries, + entries: entries.into(), extensions: HashSet::new(), score: 0, finalized: false, diff --git a/python/Cargo.toml b/python/Cargo.toml index 2118a26..c69a0aa 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -16,4 +16,4 @@ crate-type = ["cdylib"] [dependencies] pyo3 = "0.27.0" magic-db = { path = "../magic-db" } -pure-magic = { path = "../pure-magic" } +pure-magic = { path = "../pure-magic", features = ["sync"] } diff --git a/wiza/src/main.rs b/wiza/src/main.rs index 7e31a6e..d537e6b 100644 --- a/wiza/src/main.rs +++ b/wiza/src/main.rs @@ -240,16 +240,19 @@ impl Command { f.extension().and_then(|e| e.to_str()) }; + let start = Instant::now(); let Ok(magic) = db.first_magic(&mut file, ext).inspect_err(|e| { error!("failed to get magic file={}: {e}", f.to_string_lossy()) }) else { continue; }; + let scan_time = Instant::now().duration_since(start); if !o.json { println!( - "{} source:{} strength:{} mime:{} magic:{}", + "{} eval_time_us:{:.2} source:{} strength:{} mime:{} magic:{}", f.to_string_lossy(), + (scan_time.as_nanos() as f64) / 1_000.0, magic.source().unwrap_or(&Cow::Borrowed("none")), magic.strength(), magic.mime_type(),