diff --git a/CHAT_CLEANUP_GUIDE.md b/CHAT_CLEANUP_GUIDE.md new file mode 100644 index 0000000..a0cd591 --- /dev/null +++ b/CHAT_CLEANUP_GUIDE.md @@ -0,0 +1,160 @@ +# ChatGPT Export Cleanup Tool + +## The Problem: Metadata Bloat + +When you export your ChatGPT conversation history, you often encounter a frustrating situation: + +- **PDF Export (70MB)**: Smaller but rigid - just "pictures of text" that are hard for computers to process +- **JSON Export (300MB)**: Massive because it stores extensive metadata for every message: + - Timestamps + - Model IDs + - Message IDs + - Parent node relationships + - System metadata + +For every "Hello" you typed, ChatGPT stores 20+ lines of structural code! + +## The Solution + +This tool strips the 300MB JSON down to just the human-readable conversation text, reducing it to approximately **15-20MB** of pure content. + +## How to Use + +### Step 1: Prepare Your Files + +1. Create a new folder for your chat cleanup +2. Export your ChatGPT conversations as JSON (from ChatGPT settings) +3. Place the exported JSON file in the folder and rename it to `chat_history.json` +4. Copy `clean_my_chat.py` into the same folder + +Your folder structure should look like: +``` +my-chat-cleanup/ +├── chat_history.json (your 300MB export) +└── clean_my_chat.py (this script) +``` + +### Step 2: Run the Script + +Open a terminal in the folder and run: + +```bash +python clean_my_chat.py +``` + +Or if you're using Python 3 specifically: + +```bash +python3 clean_my_chat.py +``` + +The script will: +1. Load your JSON file (this may take 30-60 seconds for large files) +2. Extract all conversations and their messages +3. 
Create `chat_history_clean.txt` with just the text content + +### Step 3: Explore Your Clean Data + +You now have a lightweight text file that you can: + +#### Option 1: Ask Questions (Google NotebookLM) +- Upload `chat_history_clean.txt` to [NotebookLM](https://notebooklm.google.com/) +- Ask questions like: + - "Create a timeline of my work on the Python project" + - "What books did I mention in 2023?" + - "Summarize my conversations about machine learning" + +#### Option 2: Visual Search (VS Code) +- Open the file in [VS Code](https://code.visualstudio.com/) (free) +- Use `Ctrl+F` (Windows/Linux) or `Cmd+F` (Mac) to find keywords instantly +- Use the minimap on the right to quickly scroll through months of conversations + +#### Option 3: Local AI (GPT4All - Private) +- Download [GPT4All](https://gpt4all.io/) +- Use the "LocalDocs" feature +- Point it at your `chat_history_clean.txt` +- Chat with a local AI about your documents without data leaving your computer + +## Output Format + +The cleaned file will have this structure: + +``` +================================================== +DATE: 2024-01-15 | TITLE: Python Help +================================================== +USER: How do I read a JSON file in Python? + +ASSISTANT: You can read a JSON file in Python using the json module... + +USER: Thanks! Can you show me an example? + +ASSISTANT: Sure! Here's a complete example... + + +================================================== +DATE: 2024-01-16 | TITLE: Recipe Ideas +================================================== +USER: Give me some healthy breakfast ideas + +ASSISTANT: Here are some nutritious breakfast options... +``` + +## Technical Details + +### What Gets Removed +- Message IDs and parent/child relationships +- Model metadata (model version, temperature settings, etc.) 
+- Plugin and tool invocation data +- System timestamps (except conversation creation date) +- Conversation state management data +- User account metadata + +### What Gets Kept +- Conversation titles +- Creation dates (in readable format) +- User messages (text parts, from each conversation's active branch) +- Assistant responses (text parts, from each conversation's active branch) +- Original conversation order + +### File Size Reduction +Typical reduction: **85-95%** of original file size + +Example: +- Original JSON: 300MB +- Cleaned text: 15-20MB +- Reduction: ~94% + +## Troubleshooting + +### "File not found" error +- Make sure `chat_history.json` is in the same folder as the script +- Check that the filename is exactly `chat_history.json` (case-sensitive on Linux/Mac) + +### "Invalid JSON" error +- Your export file may be corrupted +- Try exporting again from ChatGPT settings +- Open the JSON file in a text editor to verify it's valid JSON + +### Script runs but output is empty +- Your JSON may be in a different format (older export version) +- Check the `mapping` structure in your JSON file +- The script is optimized for the standard OpenAI export format (as of 2024) + +### Out of memory errors +- The script loads the entire JSON file into memory; for extremely large files (>500MB), you may need to split the export and process it in chunks +- Close other applications to free up RAM +- Consider using a machine with more memory + +## Requirements + +- Python 3.6 or higher +- No external dependencies (uses only standard library) + +## License + +This tool is provided as-is for personal use in managing your ChatGPT conversation exports. + +## Privacy Note + +This script runs entirely on your local machine. No data is uploaded or transmitted anywhere. Your conversation history stays on your computer. 
"""Strip a ChatGPT JSON export down to plain, human-readable text.

Reads ``input_file`` (a JSON list of conversations) and writes
``output_file``: for each conversation, a header with the creation date and
title, followed by the user/assistant messages in chronological order.
"""
import datetime
import json
import sys

# CONFIGURATION
input_file = 'chat_history.json'
output_file = 'chat_history_clean.txt'

# Only messages written by these author roles are kept in the output.
_KEPT_ROLES = ('user', 'assistant')
_SEPARATOR = '=' * 50


def timestamp_to_date(ts):
    """Return *ts* (Unix seconds) as a local-time ``YYYY-MM-DD`` string.

    Returns ``"Unknown Date"`` when *ts* is missing/zero, or when it cannot
    be converted (wrong type, or out of range for the platform).
    """
    if not ts:
        return "Unknown Date"
    try:
        return datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
    except (TypeError, ValueError, OverflowError, OSError):
        return "Unknown Date"


def _as_dict(value):
    """Return *value* if it is a dict, else an empty dict (covers JSON null)."""
    return value if isinstance(value, dict) else {}


def extract_messages(convo):
    """Return the conversation's messages as ``"ROLE: text"`` strings.

    The export stores messages as a tree in ``convo['mapping']``; this walks
    from ``convo['current_node']`` up through each node's ``parent`` and
    returns the collected messages oldest-first. Only string entries of a
    message's ``content.parts`` are kept, and only for the roles listed in
    ``_KEPT_ROLES``. Messages with no text are skipped.
    """
    mapping = _as_dict(convo.get('mapping'))
    node_id = convo.get('current_node')

    collected = []
    visited = set()  # guards against parent cycles in a malformed export
    while node_id and node_id not in visited:
        visited.add(node_id)
        node = mapping.get(node_id)
        if not isinstance(node, dict):
            break

        # 'message', 'author' and 'content' may each be null in the export.
        message = _as_dict(node.get('message'))
        role = _as_dict(message.get('author')).get('role')
        parts = _as_dict(message.get('content')).get('parts')
        if isinstance(parts, str):
            parts = [parts]
        elif not isinstance(parts, list):
            parts = []

        text = "".join(part for part in parts if isinstance(part, str))
        if text and role in _KEPT_ROLES:
            collected.append(f"{role.upper()}: {text}")

        node_id = node.get('parent')

    # The walk runs newest -> oldest; flip it into chronological order.
    collected.reverse()
    return collected


def format_conversation(convo):
    """Return the full output text for one conversation.

    The text is a separator-framed ``DATE | TITLE`` header followed by each
    message and a blank line. A missing or null title becomes
    ``"Untitled Chat"``.
    """
    title = convo.get('title') or 'Untitled Chat'
    date_str = timestamp_to_date(convo.get('create_time'))
    header = (
        f"\n\n{_SEPARATOR}\n"
        f"DATE: {date_str} | TITLE: {title}\n"
        f"{_SEPARATOR}\n"
    )
    body = "".join(f"{msg}\n\n" for msg in extract_messages(convo))
    return header + body


def main():
    """Load ``input_file``, write the cleaned text to ``output_file``.

    Exits with status 1 if the input cannot be read or parsed, or if its
    top-level value is not a list of conversations.
    """
    print(f"Loading {input_file}... (This might take 30 seconds)")

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError, MemoryError) as e:
        # ValueError covers both invalid JSON and invalid UTF-8.
        print(f"Error loading file: {e}")
        sys.exit(1)

    if not isinstance(data, list):
        print(
            "Error loading file: expected a list of conversations, "
            f"got {type(data).__name__}"
        )
        sys.exit(1)

    print(f"Found {len(data)} conversations. Extracting text...")

    with open(output_file, 'w', encoding='utf-8') as f:
        for convo in data:
            # Skip entries that are not conversation objects.
            if isinstance(convo, dict):
                f.write(format_conversation(convo))

    print(f"SUCCESS! Created {output_file}")
    print("You can now open this file in any text editor or upload it to AI tools.")


if __name__ == "__main__":
    main()