Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pure-magic/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ repository.workspace = true
keywords = ["magic", "file", "mime", "identification", "libmagic"]
description = "Safe Rust re-implementation of libmagic"

[features]
default = []
sync = []

[dependencies]
chrono = "0.4.41"
dyf = { version = "0.1.1", features = ["serde"] }
Expand Down
162 changes: 162 additions & 0 deletions pure-magic/src/dou.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#[cfg(not(feature = "sync"))]
pub(crate) use default::*;

#[cfg(feature = "sync")]
pub(crate) use sync::*;

#[cfg(not(feature = "sync"))]
mod default {
use std::cell::{OnceCell, RefCell};

use serde::{Deserialize, Serialize};

use crate::EntryNode;

/// Deserialize-on-Use wrapper for an `EntryNode`.
///
/// The wrapped node is decoded lazily: a value obtained through serde deserialization
/// only carries the bincode-encoded bytes of the node, and the node itself is decoded
/// the first time `get_or_de()` is called.
///
/// A value is in one of two states:
///
/// * created with `From<EntryNode>`: `entry` is already populated and `ser` is `None`;
/// * created by serde deserialization: `ser` holds the encoded bytes and `entry` starts
///   out empty, because the field is marked `#[serde(skip)]`.
///
/// `get_or_de()` takes the bytes out of `ser` (leaving `None` behind), decodes them and
/// caches the result in `entry`, so decoding happens at most once per value.
///
/// `RefCell` and `OnceCell` provide the interior mutability needed to do this through
/// `&self`. This is the variant compiled when the `sync` feature is disabled.
#[derive(Debug, Deserialize)]
pub(crate) struct DouEntryNode {
// Encoded form of the node; `None` once it has been decoded, or when the value was
// built directly from an in-memory `EntryNode`.
ser: RefCell<Option<Vec<u8>>>,
// Decoded node, filled on first use. Never read from or written to the serialized form.
#[serde(skip)]
entry: OnceCell<EntryNode>,
}

impl From<EntryNode> for DouEntryNode {
    /// Wraps an in-memory `EntryNode`.
    ///
    /// The resulting value is already "decoded": the cell holding the node is filled
    /// and no serialized bytes are stored.
    fn from(value: EntryNode) -> Self {
        Self {
            ser: RefCell::new(None),
            // A cell built from a value starts out initialized.
            entry: OnceCell::from(value),
        }
    }
}

impl Serialize for DouEntryNode {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
#[derive(Debug, Serialize)]
struct Tmp {
ser: RefCell<Option<Vec<u8>>>,
}

let ser = self.ser.borrow_mut();
let tmp = if ser.is_none() {
Tmp {
ser: RefCell::new(Some(
bincode::serde::encode_to_vec(
self.entry.get().unwrap(),
bincode::config::standard(),
)
.unwrap(),
)),
}
} else {
Tmp {
ser: self.ser.clone(),
}
};

tmp.serialize(serializer)
}
}

impl DouEntryNode {
    /// Returns a reference to the entry node, decoding it on first access.
    ///
    /// When the node has not been decoded yet, the stored bytes are taken out of the
    /// wrapper, decoded with bincode's standard configuration and cached; subsequent
    /// calls return the cached node.
    ///
    /// # Panics
    ///
    /// Panics if the node is not decoded yet and no serialized bytes are stored, or if
    /// the stored bytes cannot be decoded into an `EntryNode`.
    pub(crate) fn get_or_de(&self) -> &EntryNode {
        self.entry.get_or_init(|| {
            // Move the bytes out, leaving `None` behind: they are only needed once.
            let bytes = self.ser.take().unwrap();
            let decoded: Result<(EntryNode, usize), _> =
                bincode::serde::decode_from_slice(&bytes, bincode::config::standard());
            let (node, _read) = decoded.unwrap();
            node
        })
    }
}
}

#[cfg(feature = "sync")]
mod sync {
use std::sync::{OnceLock, RwLock};

use serde::{Deserialize, Serialize};

use crate::EntryNode;

/// Thread-safe Deserialize-on-Use wrapper for an `EntryNode`.
///
/// The wrapped node is decoded lazily: a value obtained through serde deserialization
/// only carries the bincode-encoded bytes of the node, and the node itself is decoded
/// the first time `get_or_de()` is called.
///
/// A value is in one of two states:
///
/// * created with `From<EntryNode>`: `entry` is already populated and `ser` is `None`;
/// * created by serde deserialization: `ser` holds the encoded bytes and `entry` starts
///   out empty, because the field is marked `#[serde(skip)]`.
///
/// `get_or_de()` takes the bytes out of `ser` under the write lock (leaving `None`
/// behind), decodes them and stores the result in `entry`. The `OnceLock` runs that
/// initialization at most once, so decoding happens at most once per value.
///
/// This is the variant compiled when the `sync` feature is enabled; it uses `RwLock`
/// and `OnceLock` where the default variant uses `RefCell` and `OnceCell`.
#[derive(Debug, Deserialize)]
pub(crate) struct DouEntryNode {
// Encoded form of the node; `None` once it has been taken for decoding, or when the
// value was built directly from an in-memory `EntryNode`.
ser: RwLock<Option<Vec<u8>>>,
// Decoded node, filled on first use. Never read from or written to the serialized form.
#[serde(skip)]
entry: OnceLock<EntryNode>,
}

impl From<EntryNode> for DouEntryNode {
    /// Wraps an in-memory `EntryNode`.
    ///
    /// The resulting value is already "decoded": the cell holding the node is filled
    /// and no serialized bytes are stored.
    fn from(value: EntryNode) -> Self {
        Self {
            ser: RwLock::new(None),
            // A cell built from a value starts out initialized.
            entry: OnceLock::from(value),
        }
    }
}

impl Serialize for DouEntryNode {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
#[derive(Debug, Serialize)]
struct Tmp {
ser: RwLock<Option<Vec<u8>>>,
}

let ser = self.ser.read().unwrap();
let tmp = if ser.is_none() {
Tmp {
ser: RwLock::new(Some(
bincode::serde::encode_to_vec(
self.entry.get().unwrap(),
bincode::config::standard(),
)
.unwrap(),
)),
}
} else {
Tmp {
ser: RwLock::new(ser.clone()),
}
};

tmp.serialize(serializer)
}
}

impl DouEntryNode {
    /// Returns a reference to the entry node, decoding it on first access.
    ///
    /// When the node has not been decoded yet, the stored bytes are taken out of the
    /// wrapper under the write lock, decoded with bincode's standard configuration and
    /// cached; subsequent calls return the cached node.
    ///
    /// # Panics
    ///
    /// Panics if the lock guarding the bytes is poisoned, if the node is not decoded
    /// yet and no serialized bytes are stored, or if the stored bytes cannot be decoded
    /// into an `EntryNode`.
    pub(crate) fn get_or_de(&self) -> &EntryNode {
        self.entry.get_or_init(|| {
            let mut guard = self.ser.write().unwrap();
            // Move the bytes out, leaving `None` behind: they are only needed once.
            let bytes = guard.take().unwrap();
            // Release the lock before decoding so it is not held during the slow part.
            drop(guard);

            let decoded: Result<(EntryNode, usize), _> =
                bincode::serde::decode_from_slice(&bytes, bincode::config::standard());
            let (node, _read) = decoded.unwrap();
            node
        })
    }
}
}
35 changes: 22 additions & 13 deletions pure-magic/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ use crate::{
utils::{decode_id3, find_json_boundaries, run_utf8_validation},
};

mod dou;
mod numeric;
mod parser;
mod utils;
Expand Down Expand Up @@ -2320,7 +2321,7 @@ enum Entry<'span> {
Flag(Span<'span>, Flag),
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
struct EntryNode {
root: bool,
entry: Match,
Expand Down Expand Up @@ -2547,11 +2548,11 @@ impl EntryNode {
}

/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct MagicRule {
id: usize,
source: Option<String>,
entries: EntryNode,
entries: dou::DouEntryNode,
extensions: HashSet<String>,
/// score used for rule ranking
score: u64,
Expand All @@ -2573,7 +2574,9 @@ impl MagicRule {
marked: &mut HashSet<String>,
) -> Result<HashSet<String>, ()> {
let mut exts = HashSet::new();
self.entries.update_exts_rec(&mut exts, deps, marked)?;
self.entries
.get_or_de()
.update_exts_rec(&mut exts, deps, marked)?;
Ok(exts)
}

Expand All @@ -2586,8 +2589,9 @@ impl MagicRule {
marked: &mut HashSet<String>,
) -> u64 {
let mut score = 0;
score += self.entries.entry.test_strength;
score += self.entries.get_or_de().entry.test_strength;
self.entries
.get_or_de()
.update_score_rec(depth, &mut score, deps, marked);
score
}
Expand Down Expand Up @@ -2622,7 +2626,7 @@ impl MagicRule {
switch_endianness: bool,
depth: usize,
) -> Result<(), Error> {
self.entries.matches(
self.entries.get_or_de().matches(
self.source.as_deref(),
magic,
&mut MatchState::empty(),
Expand Down Expand Up @@ -2650,7 +2654,7 @@ impl MagicRule {
switch_endianness: bool,
depth: usize,
) -> Result<(), Error> {
self.entries.matches(
self.entries.get_or_de().matches(
self.source.as_deref(),
magic,
&mut MatchState::empty(),
Expand All @@ -2671,8 +2675,13 @@ impl MagicRule {
///
/// * `bool` - True if the rule is for text files
pub fn is_text(&self) -> bool {
self.entries.entry.test.is_text()
&& self.entries.children.iter().all(|e| e.entry.test.is_text())
self.entries.get_or_de().entry.test.is_text()
&& self
.entries
.get_or_de()
.children
.iter()
.all(|e| e.entry.test.is_text())
}

/// Gets the rule's score used for ranking rules between them
Expand Down Expand Up @@ -2702,7 +2711,7 @@ impl MagicRule {
/// * `usize` - The rule's line number
#[inline(always)]
pub fn line(&self) -> usize {
self.entries.entry.line
self.entries.get_or_de().entry.line
}

/// Gets all the file extensions associated to the rule
Expand All @@ -2716,7 +2725,7 @@ impl MagicRule {
}
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
struct DependencyRule {
name: String,
rule: MagicRule,
Expand All @@ -2727,7 +2736,7 @@ struct DependencyRule {
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct MagicSource {
rules: Vec<MagicRule>,
dependencies: HashMap<String, DependencyRule>,
Expand Down Expand Up @@ -3030,7 +3039,7 @@ impl<'m> Magic<'m> {
}

/// Represents a database of [`MagicRule`]
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct MagicDb {
rule_id: usize,
rules: Vec<MagicRule>,
Expand Down
16 changes: 9 additions & 7 deletions pure-magic/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -672,9 +672,10 @@ impl Use {

let mut message = None;
if let Some(msg_pair) = pairs.next()
&& !msg_pair.as_str().is_empty() {
message = Some(Message::from_pair(msg_pair)?);
};
&& !msg_pair.as_str().is_empty()
{
message = Some(Message::from_pair(msg_pair)?);
};

Ok(Self {
line,
Expand Down Expand Up @@ -1443,9 +1444,10 @@ impl MagicRule {

let mut message = None;
if let Some(msg) = pairs.next()
&& !msg.as_str().is_empty() {
message = Some(Message::from_pair(msg)?)
}
&& !msg.as_str().is_empty()
{
message = Some(Message::from_pair(msg)?)
}

items.push(Entry::Match(
span,
Expand Down Expand Up @@ -1475,7 +1477,7 @@ impl MagicRule {
Ok(Self {
id: 0,
source,
entries,
entries: entries.into(),
extensions: HashSet::new(),
score: 0,
finalized: false,
Expand Down
2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ crate-type = ["cdylib"]
[dependencies]
pyo3 = "0.27.0"
magic-db = { path = "../magic-db" }
pure-magic = { path = "../pure-magic" }
pure-magic = { path = "../pure-magic", features = ["sync"] }
5 changes: 4 additions & 1 deletion wiza/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,16 +240,19 @@ impl Command {
f.extension().and_then(|e| e.to_str())
};

let start = Instant::now();
let Ok(magic) = db.first_magic(&mut file, ext).inspect_err(|e| {
error!("failed to get magic file={}: {e}", f.to_string_lossy())
}) else {
continue;
};
let scan_time = Instant::now().duration_since(start);

if !o.json {
println!(
"{} source:{} strength:{} mime:{} magic:{}",
"{} eval_time_us:{:.2} source:{} strength:{} mime:{} magic:{}",
f.to_string_lossy(),
(scan_time.as_nanos() as f64) / 1_000.0,
magic.source().unwrap_or(&Cow::Borrowed("none")),
magic.strength(),
magic.mime_type(),
Expand Down
Loading