Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions nexus/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sklearn.cluster import KMeans
import os
from dotenv import load_dotenv
import re

load_dotenv()

Expand All @@ -19,6 +20,34 @@ def __init__(self):
self.utils = Utils()
self.atlas = AtlasClient()

def _neutralize_for_log(self, value):
    r"""Return ``value`` made safe for inclusion in a log line (CWE-117).

    Processing of each string happens in two steps:

    1. Well-formed 7-bit ANSI escape sequences (ESC + Fe byte, or a full
       CSI sequence) are removed.
    2. Every control character that is still present is replaced by a
       visible escape: newline, carriage return and tab become ``\n``,
       ``\r`` and ``\t``; any other C0/C1 control character or DEL becomes
       ``\xNN``; the Unicode line/paragraph separators become ``\u2028`` /
       ``\u2029``.

    Step 2 also covers a lone ESC byte. That matters because removing one
    escape sequence can splice its neighbours into a *new* sequence
    (ESC ESC ``[m`` ``X`` collapses to ESC ``X``); escaping whatever ESC
    is left guarantees no raw escape sequence reaches the log.

    Literal backslashes in the input are passed through unchanged.

    Args:
        value: A single value, or a list of values. Non-string values are
            converted with ``str()`` before being sanitized.

    Returns:
        A sanitized string, or a list of sanitized strings when ``value``
        is a list.
    """
    ansi_escape = re.compile(r'''
        \x1B  # ESC
        (?:   # 7-bit C1 Fe (except CSI)
            [@-Z\\-_]
        |     # or [ for CSI: parameter bytes, intermediate bytes, final byte
            \[ [0-?]* [ -/]* [@-~]
        )
    ''', re.VERBOSE)
    # C0 controls, DEL, C1 controls (includes NEL and 8-bit CSI) and the
    # Unicode line/paragraph separators: anything that can break a log line
    # or drive a terminal.
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f\u2028\u2029]')
    named_escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

    def escape_control(match):
        char = match.group()
        if char in named_escapes:
            return named_escapes[char]
        code = ord(char)
        if code <= 0xFF:
            return f'\\x{code:02x}'
        return f'\\u{code:04x}'

    def neutralize_str(s):
        if not isinstance(s, str):
            s = str(s)
        s = ansi_escape.sub('', s)
        # Escape after stripping so control bytes exposed by the removal
        # (for example a leftover ESC) are neutralized as well.
        return control_chars.sub(escape_control, s)

    # Handle lists (like the 'users' argument) element by element.
    if isinstance(value, list):
        return [neutralize_str(v) for v in value]
    return neutralize_str(value)

def create_social_profile_tweepy(self, map_name: str, map_description: str, users: List[str], outdir: str):
"""Create social profile with tweepy as tweet source

Expand Down Expand Up @@ -68,14 +97,16 @@ def create_social_profile_sns(self,


for user in tqdm(users):
user_safe = self._neutralize_for_log(user)
users_safe = self._neutralize_for_log(users)
try:
logger.info(f"Loading {user}'s tweets from disk")
logger.info(f"Loading {user_safe}'s tweets from disk")
data_path = os.path.join(outdir, f"{user}_tweets.jsonl")
with jsonlines.open(data_path, mode="r") as tweets:
for tweet in tweets:
all_tweets.append(tweet)
except BaseException:
logger.info(f"Not on disk! scraping {users}'s tweets now")
logger.info(f"Not on disk! scraping {users_safe}'s tweets now")
tweets = self.utils.user_lookup_sns(user, 10000)
with jsonlines.open(f'{outdir}/{user}_tweets.jsonl', mode='a') as writer:
for idx, tweet in enumerate(tweets):
Expand Down Expand Up @@ -144,4 +175,4 @@ def create_social_profile_sns(self,
map_description="A social profile of the latest POTUS Joe Biden, with Nomic's text embedder created by Yuvanesh Anand",
users=["JoeBiden", "POTUS"],
topics=True,
embedding_path="embeddings/JoeBiden.npy")
embedding_path="embeddings/JoeBiden.npy")