From 41c67a7295033b0502ce579a0e47e51e7781c49f Mon Sep 17 00:00:00 2001 From: cheran Date: Sun, 12 Apr 2026 20:56:51 +0530 Subject: [PATCH] Added campus placement copilot project --- .env.example | 13 ++ .gitignore | 54 ++++---- README.md | 285 ++++++++++++++++++++++---------------- app.py | 112 +++++++++++++++ backend/ingest.py | 145 +++++++++++++++++++ backend/search.py | 100 ++++++++++++++ cd | 0 data/placement_data.json | 44 ++++++ frontend/index.html | 77 +++++++++++ frontend/script.js | 154 +++++++++++++++++++++ frontend/style.css | 291 +++++++++++++++++++++++++++++++++++++++ requirements.txt | 8 ++ scripts/ingest_data.py | 40 ++++++ scripts/quick_ingest.py | 44 ++++++ 14 files changed, 1223 insertions(+), 144 deletions(-) create mode 100644 .env.example create mode 100644 app.py create mode 100644 backend/ingest.py create mode 100644 backend/search.py create mode 100644 cd create mode 100644 data/placement_data.json create mode 100644 frontend/index.html create mode 100644 frontend/script.js create mode 100644 frontend/style.css create mode 100644 requirements.txt create mode 100644 scripts/ingest_data.py create mode 100644 scripts/quick_ingest.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000..5edee28901 --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# Endee Configuration +ENDEE_API_KEY=your_endee_api_key_here +ENDEE_URL=https://api.endee.io +ENDEE_INDEX_NAME=placement_copilot +ENDEE_MODE=local # "local" for mock (default), "production" for real Endee + +# Embedding Model +MODEL_NAME=all-MiniLM-L6-v2 + +# Server Configuration +HOST=0.0.0.0 +PORT=8000 +DEBUG=True diff --git a/.gitignore b/.gitignore index ddd10fad2b..8cbe1df9f8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,32 +1,34 @@ - -# Ignore build directory +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +env/ +ENV/ +*.egg-info/ +dist/ build/ -# Ignore build like directory -build*/ -# Ignore all files in the build directory -build/* - -# Ignore 
tests build directory -tests/build/ -tests/build*/ - -# Test binaries -tests/**/ndd_filter_test - -# macOS debug symbols -*.dSYM/ -# Sometimes data files are created for tetsing -data/ -data/* +# Environment +.env +.env.local -# VS Code directory +# IDE .vscode/ -.vscode/* +.idea/ +*.swp +*.swo +*~ -# Frontend -frontend/ -frontend/* - -# DS Store +# OS .DS_Store +Thumbs.db + +# Data +*.json.backup + +# Logs +*.log +.venv/ diff --git a/README.md b/README.md index ac42738edd..106fea2823 100644 --- a/README.md +++ b/README.md @@ -1,139 +1,188 @@ -

- - - - Endee - -

- -

- High-performance open-source vector database for AI search, RAG, semantic search, and hybrid retrieval. -

- -

- Quick Start - Docs - License - Discord - Website - - -

- -

-Quick StartWhy EndeeUse CasesFeaturesAPI and ClientsDocsContact -

- -# Endee: Open-Source Vector Database for AI Search - -**Endee** is a high-performance open-source vector database built for AI search and retrieval workloads. It is designed for teams building **RAG pipelines**, **semantic search**, **hybrid search**, recommendation systems, and filtered vector retrieval APIs that need production-oriented performance and control. - -Endee combines vector search with filtering, sparse retrieval support, backup workflows, and deployment flexibility across local builds and Docker-based environments. The project is implemented in C++ and optimized for modern CPU targets, including AVX2, AVX512, NEON, and SVE2. - -If you want the fastest path to evaluate Endee locally, start with the [Getting Started guide](./docs/getting-started.md) or the hosted docs at [docs.endee.io](https://docs.endee.io/quick-start). - -## Why Endee - -- Built as a dedicated vector database for AI applications, search systems, and retrieval-heavy workloads. -- Supports dense vector retrieval plus sparse search capabilities for hybrid search use cases. -- Includes payload filtering for metadata-aware retrieval and application-specific query logic. -- Ships with operational features already documented in this repo, including backup flows and runtime observability. -- Offers flexible deployment paths: local scripts, manual builds, Docker images, and prebuilt registry images. +# 🎓 Campus Placement Copilot + +> AI-powered RAG system to help engineering students prepare for campus placements using semantic search and LLM-generated answers. + +![RAG Pipeline](https://img.shields.io/badge/RAG-Pipeline-blue) ![Endee](https://img.shields.io/badge/Vector_DB-Endee-green) ![Groq](https://img.shields.io/badge/LLM-Groq_LLaMA3-orange) ![FastAPI](https://img.shields.io/badge/Backend-FastAPI-teal) + +--- + +## 🚀 What is this? 
+ +Campus Placement Copilot is a **Retrieval-Augmented Generation (RAG)** application that helps students get accurate, context-aware answers about campus placement preparation — covering companies like TCS, Infosys, Wipro, Cognizant, and more. + +Students can ask questions like: +- *"How to crack TCS Ninja interview?"* +- *"What is the Infosys System Engineer selection process?"* +- *"Tips for Wipro NLTH exam?"* + +The system retrieves the most relevant information from a vector database and generates a precise answer using an LLM. + +--- + +## 🏗️ System Design +Student Query +│ +▼ +Frontend (HTML/CSS/JS) +│ POST /ask +▼ +FastAPI Backend (port 8000) +│ +▼ +SentenceTransformer +(all-MiniLM-L6-v2) +generates query embedding +│ +▼ +Endee Vector DB (HNSW Index) +searches top-K similar chunks +│ +▼ +Retrieved Context Chunks +│ +▼ +Groq LLaMA 3.1 (LLM) +generates final answer +│ +▼ +Answer + Sources → Student + +--- + +## 🛠️ Tech Stack + +| Component | Technology | +|---|---| +| Vector Database | **Endee** (Docker, HNSW index) | +| Embeddings | SentenceTransformers `all-MiniLM-L6-v2` | +| LLM | Groq `llama-3.1-8b-instant` | +| Backend | FastAPI + Python | +| Frontend | Vanilla HTML, CSS, JavaScript | +| Deployment | Docker + Uvicorn | + +--- + +## 🔍 How Endee is Used + +[Endee](https://github.com/endee-io/endee) is a high-performance open-source vector database built for speed and efficiency. + +In this project, Endee is used to: +1. **Store** document embeddings as 384-dimensional vectors using HNSW index +2. **Search** semantically similar chunks using cosine similarity +3. 
**Retrieve** top-K relevant context for RAG pipeline + +```python +# Create index +client.create_index( + name="placement_copilot", + dimension=384, + space_type="cosine", + precision=Precision.INT8 +) + +# Upsert vectors +index.upsert([{ + "id": "doc_0_chunk_0", + "vector": embedding, + "meta": {"text": chunk, "company": "TCS"} +}]) + +# Semantic search +results = index.query( + vector=query_embedding, + top_k=5, + ef=128 +) +``` -## Getting Started +--- -The full installation, build, Docker, runtime, and authentication instructions are in [docs/getting-started.md](./docs/getting-started.md). +## ⚙️ Setup Instructions -Fastest local path: +### Prerequisites +- Python 3.8+ +- Docker Desktop +- Groq API key (free at [console.groq.com](https://console.groq.com)) +### Step 1 — Clone the repo ```bash -chmod +x ./install.sh ./run.sh -./install.sh --release --avx2 -./run.sh +git clone https://github.com/YOUR_USERNAME/YOUR_REPO.git +cd campus-placement-copilot ``` -The server listens on port `8080`. For detailed setup paths, supported operating systems, CPU optimization flags, Docker usage, and authentication examples, use: - -- [Getting Started](./docs/getting-started.md) -- [Hosted Quick Start Docs](https://docs.endee.io/quick-start) - -## Use Cases - -### RAG and AI Retrieval - -Use Endee as the retrieval layer for question answering, chat assistants, copilots, and other RAG applications that need fast vector search with metadata-aware filtering. - -### Agentic AI and AI Agent Memory - -Use Endee as the long-term memory and context retrieval layer for AI agents built with frameworks like LangChain, CrewAI, AutoGen, and LlamaIndex. Store and retrieve past observations, tool outputs, conversation history, and domain knowledge mid-execution with low-latency filtered vector search, so your autonomous agents get the right context without stalling their reasoning loop. 
- -### Semantic Search - -Build semantic search experiences for documents, products, support content, and knowledge bases using vector similarity search instead of exact keyword-only matching. - -### Hybrid Search - -Combine dense retrieval, sparse vectors, and filtering to improve relevance for search workflows where both semantic understanding and term-level precision matter. - -### Recommendations and Matching - -Support recommendation, similarity matching, and nearest-neighbor retrieval workflows across text, embeddings, and other high-dimensional representations. - -## Features - -- **Vector search** for AI retrieval and semantic similarity workloads. -- **Hybrid retrieval support** with sparse vector capabilities documented in [docs/sparse.md](./docs/sparse.md). -- **Payload filtering** for structured retrieval logic documented in [docs/filter.md](./docs/filter.md). -- **Backup APIs and flows** documented in [docs/backup-system.md](./docs/backup-system.md). -- **Operational logging and instrumentation** documented in [docs/logs.md](./docs/logs.md) and [docs/mdbx-instrumentation.md](./docs/mdbx-instrumentation.md). -- **CPU-targeted builds** for AVX2, AVX512, NEON, and SVE2 deployments. -- **Docker deployment options** for local and server environments. - -## API and Clients - -Endee exposes an HTTP API for managing indexes and serving retrieval workloads. The current repo documentation and examples focus on running the server directly and calling its API endpoints. 
- -Current developer entry points: - -- [Getting Started](./docs/getting-started.md) for local build and run flows -- [Hosted Docs](https://docs.endee.io/quick-start) for product documentation -- [Release Notes 1.0.0](https://github.com/endee-io/endee/releases/tag/1.0.0) for recent platform changes - -## Docs and Links +### Step 2 — Start Endee Vector DB +```bash +docker run -d \ + -p 8080:8080 \ + -v ./endee-data:/data \ + --name endee-server \ + endeeio/endee-server:latest +``` -- [Getting Started](./docs/getting-started.md) -- [Hosted Documentation](https://docs.endee.io/quick-start) -- [Release Notes](https://github.com/endee-io/endee/releases/tag/1.0.0) -- [Sparse Search](./docs/sparse.md) -- [Filtering](./docs/filter.md) -- [Backups](./docs/backup-system.md) +### Step 3 — Install dependencies +```bash +pip install -r requirements.txt +``` -## Community and Contact +### Step 4 — Configure environment +Create `.env` file: +```env +GROQ_API_KEY=your_groq_api_key_here +GROQ_MODEL=llama-3.1-8b-instant +MODEL_NAME=all-MiniLM-L6-v2 +``` -- Join the community on [Discord](https://discord.gg/5HFGqDZQE3) -- Visit the website at [endee.io](https://endee.io/) -- For trademark or branding permissions, contact [enterprise@endee.io](mailto:enterprise@endee.io) +### Step 5 — Ingest data into Endee +```bash +python -c "import asyncio; from backend.ingest import ingest_data; asyncio.run(ingest_data())" +``` -## Contributing +### Step 6 — Start backend +```bash +python app.py +``` -We welcome contributions from the community to help make vector search faster and more accessible for everyone. 
+### Step 7 — Start frontend +```bash +cd frontend +python -m http.server 3000 +``` -- Submit pull requests for fixes, features, and improvements -- Report bugs or performance issues through GitHub issues -- Propose enhancements for search quality, performance, and deployment workflows +### Step 8 — Open browser +http://localhost:3000 -## License +--- -Endee is open source software licensed under the **Apache License 2.0**. See the [LICENSE](./LICENSE) file for full terms. +## 📁 Project Structure +campus-placement-copilot/ +├── app.py # FastAPI entry point +├── backend/ +│ ├── ingest.py # Data ingestion + Endee upsert +│ └── search.py # Semantic search + Groq LLM +├── frontend/ +│ ├── index.html # UI +│ ├── script.js # API calls +│ └── style.css # Styling +├── data/ +│ └── placement_data.json # Placement knowledge base +├── requirements.txt +└── README.md -## Trademark and Branding +--- -“Endee” and the Endee logo are trademarks of Endee Labs. +## 💡 Features -The Apache License 2.0 does not grant permission to use the Endee name, logos, or branding in a way that suggests endorsement or affiliation. +- ✅ Semantic search powered by Endee HNSW vector index +- ✅ RAG pipeline — retrieve then generate +- ✅ Groq LLaMA 3.1 for fast, accurate answers +- ✅ Source citations with every answer +- ✅ Coverage: TCS, Infosys, Wipro, Cognizant, and more -If you offer a hosted or managed service based on this software, you must use your own branding and avoid implying it is an official Endee service. +--- -## Third-Party Software +## 🔗 References -This project includes or depends on third-party software components licensed under their respective open-source licenses. Use of those components is governed by their own license terms. 
+- [Endee Vector DB](https://github.com/endee-io/endee) +- [Endee Documentation](https://docs.endee.io) +- [Groq API](https://console.groq.com) +- [SentenceTransformers](https://www.sbert.net) \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000..306ee4f2a5 --- /dev/null +++ b/app.py @@ -0,0 +1,112 @@ +from fastapi import FastAPI, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from typing import List, Dict, Any +import os +import traceback +from dotenv import load_dotenv + +from backend.search import search_documents, ask_question +from backend.ingest import ingest_data + +load_dotenv() + +app = FastAPI(title="Campus Placement Copilot", version="1.0.0") + +# CORS for frontend +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# Debug middleware - catches all exceptions and prints traceback +@app.middleware("http") +async def debug_exceptions(request: Request, call_next): + try: + response = await call_next(request) + return response + except Exception as e: + # Print full traceback to terminal + print("=" * 80) + print("EXCEPTION CAUGHT IN MIDDLEWARE:") + print(f"Path: {request.url.path}") + print(f"Method: {request.method}") + print("=" * 80) + print(traceback.format_exc()) + print("=" * 80) + + # Re-raise to let FastAPI handle it + raise + +# Request/Response Models +class SearchRequest(BaseModel): + query: str + top_k: int = 5 + +class SearchResponse(BaseModel): + query: str + results: List[Dict[str, Any]] + +class AskRequest(BaseModel): + question: str + top_k: int = 5 + +class AskResponse(BaseModel): + question: str + answer: str + sources: List[Dict[str, Any]] + +# Routes +@app.get("/health") +async def health(): + return {"status": "healthy"} + +@app.post("/search", response_model=SearchResponse) +async def search(request: 
SearchRequest): + try: + results = await search_documents(request.query, request.top_k) + return SearchResponse(query=request.query, results=results) + except Exception as e: + # Print traceback for debugging + print("=" * 80) + print(f"ERROR in /search endpoint:") + print(traceback.format_exc()) + print("=" * 80) + + # Return error JSON during development + return JSONResponse( + status_code=500, + content={"error": str(e), "detail": traceback.format_exc()} + ) + +@app.post("/ask", response_model=AskResponse) +async def ask(request: AskRequest): + try: + answer, sources = await ask_question(request.question, request.top_k) + return AskResponse(question=request.question, answer=answer, sources=sources) + except Exception as e: + # Print traceback for debugging + print("=" * 80) + print(f"ERROR in /ask endpoint:") + print(traceback.format_exc()) + print("=" * 80) + + # Return error JSON during development + return JSONResponse( + status_code=500, + content={"error": str(e), "detail": traceback.format_exc()} + ) + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "app:app", + host=os.getenv("HOST", "0.0.0.0"), + port=int(os.getenv("PORT", 8000)), + reload=os.getenv("DEBUG", "False").lower() == "true" + ) diff --git a/backend/ingest.py b/backend/ingest.py new file mode 100644 index 0000000000..eda71fb227 --- /dev/null +++ b/backend/ingest.py @@ -0,0 +1,145 @@ +import os +import json +from typing import List, Dict, Any +from sentence_transformers import SentenceTransformer +from endee import Endee, Precision +from dotenv import load_dotenv + +load_dotenv() + +# Load model once +model = SentenceTransformer(os.getenv("MODEL_NAME", "all-MiniLM-L6-v2")) + +# Endee client — local Docker server +client = Endee() +client.set_base_url("http://localhost:8080/api/v1") + +INDEX_NAME = "placement_copilot" +DIMENSION = 384 # all-MiniLM-L6-v2 output size + + +def get_or_create_index(): + """Create index if not exists, return index object""" + try: + index = 
client.get_index(INDEX_NAME) + print(f"✓ Index already exists: {INDEX_NAME}") + return index + except Exception: + pass + + try: + client.create_index( + name=INDEX_NAME, + dimension=DIMENSION, + space_type="cosine", + precision=Precision.INT8 + ) + print(f"✓ Created Endee index: {INDEX_NAME}") + return client.get_index(INDEX_NAME) + except Exception as e: + print(f"✗ Failed to create index: {e}") + raise + + +def generate_embedding(text: str) -> List[float]: + """Generate embedding for a single text""" + return model.encode(text).tolist() + + +def generate_embeddings(texts: List[str]) -> List[List[float]]: + """Generate embeddings for multiple texts""" + return model.encode(texts).tolist() + + +async def search_endee(query_embedding: List[float], top_k: int = 5) -> List[Dict]: + """Search Endee index using query embedding""" + try: + index = client.get_index(INDEX_NAME) + results = index.query( + vector=query_embedding, + top_k=top_k, + ef=128, + include_vectors=False + ) + print(f"✓ Endee search returned {len(results)} results") + return results + except Exception as e: + print(f"✗ Endee search failed: {e}") + return [] + + +async def ingest_data(file_path: str = "data/placement_data.json") -> Dict: + """Load JSON data, chunk, embed, and upsert to Endee""" + + # Load JSON + with open(file_path, 'r', encoding='utf-8') as f: + documents = json.load(f) + + if not isinstance(documents, list): + raise ValueError("JSON must contain a list of documents") + + print(f"Loaded {len(documents)} documents") + + # Get or create Endee index + index = get_or_create_index() + + # Process documents into vectors + vectors = [] + for idx, doc in enumerate(documents): + content = doc.get("content", "") + metadata = doc.get("metadata", {}) + + if not content: + continue + + chunks = [content] if len(content) <= 1000 else split_into_chunks(content, 1000) + embeddings = generate_embeddings(chunks) + + for chunk_idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)): + 
vectors.append({ + "id": f"doc_{idx}_chunk_{chunk_idx}", + "vector": embedding, + "meta": { + **metadata, + "text": chunk, + "chunk_index": chunk_idx + } + }) + + print(f"Generated {len(vectors)} vectors") + + # Upsert in batches of 1000 (Endee limit) + batch_size = 1000 + for i in range(0, len(vectors), batch_size): + batch = vectors[i:i + batch_size] + index.upsert(batch) + print(f"✓ Upserted batch {i // batch_size + 1} ({len(batch)} vectors)") + + print(f"✓ Ingested {len(vectors)} vectors into Endee index '{INDEX_NAME}'") + return {"ingested": len(vectors), "documents": len(documents)} + + +def split_into_chunks(text: str, chunk_size: int = 1000) -> List[str]: + """Split text into chunks at sentence boundaries""" + sentences = text.replace('. ', '.\n').split('\n') + chunks = [] + current_chunk = [] + current_length = 0 + + for sentence in sentences: + sentence = sentence.strip() + if not sentence: + continue + + if current_length + len(sentence) > chunk_size and current_chunk: + chunks.append(' '.join(current_chunk)) + current_chunk = [sentence] + current_length = len(sentence) + else: + current_chunk.append(sentence) + current_length += len(sentence) + + if current_chunk: + chunks.append(' '.join(current_chunk)) + + return chunks \ No newline at end of file diff --git a/backend/search.py b/backend/search.py new file mode 100644 index 0000000000..a48480ceca --- /dev/null +++ b/backend/search.py @@ -0,0 +1,100 @@ +import os +import httpx +from typing import List, Dict, Tuple +from dotenv import load_dotenv +from backend.ingest import generate_embedding, search_endee + +load_dotenv() + +GROQ_API_KEY = os.getenv("GROQ_API_KEY") +GROQ_MODEL = "llama-3.1-8b-instant" + + +async def search_documents(query: str, top_k: int = 5) -> List[Dict]: + """Search for relevant documents using Endee vector search""" + + query_embedding = generate_embedding(query) + results = await search_endee(query_embedding, top_k) + + formatted_results = [] + for result in results: + meta = 
result.get("meta", {}) + formatted_results.append({ + "text": meta.get("text", ""), + "score": result.get("similarity", 0.0), + "metadata": {k: v for k, v in meta.items() if k != "text"} + }) + + return formatted_results + + +async def ask_question(question: str, top_k: int = 5) -> Tuple[str, List[Dict]]: + """Answer question using RAG — Endee retrieval + Groq LLM""" + + query_embedding = generate_embedding(question) + results = await search_endee(query_embedding, top_k) + + if not results: + return "No relevant information found in the database.", [] + + # Build sources and context + sources = [] + context_parts = [] + + for idx, result in enumerate(results, 1): + meta = result.get("meta", {}) + text = meta.get("text", "") + + sources.append({ + "text": text, + "score": result.get("similarity", 0.0), + "metadata": {k: v for k, v in meta.items() if k != "text"} + }) + context_parts.append(f"[{idx}] {text}") + + context = "\n\n".join(context_parts) + + # Call Groq LLM + answer = await call_groq(question, context) + + return answer, sources + + +async def call_groq(question: str, context: str) -> str: + """Call Groq LLaMA3 with retrieved context""" + + if not GROQ_API_KEY: + # Fallback if no Groq key + return f"Based on the retrieved information:\n\n{context}" + + prompt = f"""You are a Campus Placement Copilot. Answer the student's question using only the context below. +If the answer is not in the context, say "I don't have information on that." 
+ +Context: +{context} + +Question: {question} + +Answer:""" + + try: + async with httpx.AsyncClient(timeout=30) as http: + response = await http.post( + "https://api.groq.com/openai/v1/chat/completions", + headers={ + "Authorization": f"Bearer {GROQ_API_KEY}", + "Content-Type": "application/json" + }, + json={ + "model": GROQ_MODEL, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 512, + "temperature": 0.3 + } + ) + response.raise_for_status() + return response.json()["choices"][0]["message"]["content"].strip() + + except Exception as e: + print(f"✗ Groq API error: {e}") + return f"Based on the retrieved information:\n\n{context}" \ No newline at end of file diff --git a/cd b/cd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/data/placement_data.json b/data/placement_data.json new file mode 100644 index 0000000000..ffbb3a8ef2 --- /dev/null +++ b/data/placement_data.json @@ -0,0 +1,44 @@ +[ + { + "content": "How to crack TCS Ninja Interview: The interview process consists of three rounds - Ninja Test, Technical Interview, and HR Interview. For the Ninja Test, focus on aptitude questions, basic programming concepts in C/C++/Java, and verbal ability. In the technical interview, expect questions on data structures, OOPs concepts, database management, and your final year project. Common questions include: Explain polymorphism, difference between SQL and NoSQL, write a program to find factorial using recursion. For HR round, prepare answers for 'Tell me about yourself', 'Why TCS?', 'Where do you see yourself in 5 years?'", + "metadata": { + "category": "interview_experience", + "company": "TCS", + "role": "Ninja", + "difficulty": "easy" + } + }, + { + "content": "Top 20 DSA Patterns for Placement: 1) Two Pointers - used for sorted arrays, palindrome problems. 2) Sliding Window - subarray problems, longest substring. 3) Binary Search - search in sorted arrays. 4) BFS/DFS - tree and graph traversal. 
5) Dynamic Programming - optimization problems. 6) Greedy Algorithms - interval scheduling. 7) Backtracking - combination/ permutation problems. 8) Hash Map - frequency counting, two sum. 9) Heap/Priority Queue - top K elements. 10) Trie - prefix problems. Master these patterns and you can solve 80% of interview questions.", + "metadata": { + "category": "dsa_notes", + "topic": "patterns", + "difficulty": "medium" + } + }, + { + "content": "Common HR Interview Questions and Answers: 1) Tell me about yourself - Start with education, mention key projects, highlight relevant skills. 2) What are your strengths? - Mention 2-3 strengths with examples. 3) What are your weaknesses? - Be honest but show improvement. 4) Why should we hire you? - Match your skills with job requirements. 5) Where do you see yourself in 5 years? - Show ambition but be realistic. 6) Why do you want to work here? - Research the company. 7) What is your greatest achievement? - Use STAR method. 8) How do you handle pressure? - Give specific examples. 9) Do you prefer working alone or in a team? - Show flexibility. 10) Any questions for us? - Always have 2-3 questions ready.", + "metadata": { + "category": "hr_questions", + "topic": "common_questions", + "difficulty": "easy" + } + }, + { + "content": "Resume Writing Tips for Freshers: 1) Keep it to one page - recruiters spend only 6-7 seconds initially. 2) Use action verbs - 'Developed', 'Implemented', 'Optimized' instead of 'Worked on'. 3) Quantify achievements - 'Improved performance by 30%' not 'Improved performance'. 4) Tailor for each company - match keywords from job description. 5) Include projects with tech stack - mention technologies used. 6) Add GitHub/Live links - showcase your work. 7) Education section - include CGPA if above 7.5. 8) Skills section - categorize into Programming, Web Dev, Tools, etc. 9) Certifications - include relevant ones only. 
10) Avoid: photos, hobbies unless relevant, personal details like address, fake information.", + "metadata": { + "category": "resume_tips", + "topic": "freshers", + "difficulty": "easy" + } + }, + { + "content": "Infosys System Engineer Interview Experience: Round 1 - Online Assessment with 3 sections: Reasoning (15 questions), Math (10 questions), and Verbal (10 questions). Time limit: 45 minutes. Round 2 - Technical Interview focused on: One programming question (array manipulation or string operations), Database normalization and SQL queries, OOPs concepts with real-world examples, Discussion about internship projects. Key preparation areas: Practice coding on arrays and strings, revise DBMS basics, understand SDLC models, prepare 2-3 projects thoroughly. Round 3 - HR Interview - Standard questions about relocation, night shifts, career goals, and family background.", + "metadata": { + "category": "interview_experience", + "company": "Infosys", + "role": "System Engineer", + "difficulty": "easy" + } + } +] diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000000..740f656328 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,77 @@ + + + + + + Campus Placement Copilot + + + +
+
+

Campus Placement Copilot

+

Ask questions about interviews, DSA, HR rounds, and resume tips

+
+ +
+ +
+
+ + +
+
+ + + + + + + + + + + + + + +
+
+

👋 Welcome!

+

I can help you with placement preparation. Try asking:

+
    +
  • How to crack TCS Ninja interview?
  • +
  • What are important DSA patterns?
  • +
  • Common HR interview questions?
  • +
  • Tips for writing a good resume?
  • +
  • Infosys interview experience?
  • +
+
+
+
+ +
+

Powered by RAG + Endee Vector Database

+
+
+ + + + diff --git a/frontend/script.js b/frontend/script.js new file mode 100644 index 0000000000..1b95712070 --- /dev/null +++ b/frontend/script.js @@ -0,0 +1,154 @@ +// Configuration +const API_BASE_URL = 'http://localhost:8000'; + +// DOM Elements +const questionInput = document.getElementById('question-input'); +const searchBtn = document.getElementById('search-btn'); +const loadingEl = document.getElementById('loading'); +const errorEl = document.getElementById('error'); +const errorMessageEl = document.getElementById('error-message'); +const answerSection = document.getElementById('answer-section'); +const answerContent = document.getElementById('answer-content'); +const sourcesSection = document.getElementById('sources-section'); +const sourcesList = document.getElementById('sources-list'); +const initialState = document.getElementById('initial-state'); + +// Event Listeners +searchBtn.addEventListener('click', handleSearch); +questionInput.addEventListener('keypress', (e) => { + if (e.key === 'Enter') { + handleSearch(); + } +}); + +// Allow clicking example questions +document.querySelectorAll('.example-questions li').forEach(li => { + li.addEventListener('click', () => { + questionInput.value = li.textContent; + handleSearch(); + }); +}); + +async function handleSearch() { + const question = questionInput.value.trim(); + + if (!question) return; + + // Reset states + hideAll(); + showLoading(); + searchBtn.disabled = true; + + try { + // Call /ask endpoint + const response = await fetch(`${API_BASE_URL}/ask`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify({ + question: question, + top_k: 5 + }) + }); + + if (!response.ok) { + const errorData = await response.json(); + throw new Error(errorData.detail || `HTTP error! 
status: ${response.status}`); + } + + const data = await response.json(); + + // Display answer and sources + displayAnswer(data.answer); + displaySources(data.sources); + + hideLoading(); + + } catch (error) { + console.error('Error:', error); + hideLoading(); + showError(error.message || 'Failed to get answer. Please try again.'); + } finally { + searchBtn.disabled = false; + questionInput.focus(); + } +} + +function displayAnswer(answer) { + answerContent.textContent = answer; + answerSection.classList.remove('hidden'); +} + +function displaySources(sources) { + if (!sources || sources.length === 0) { + return; + } + + sourcesList.innerHTML = ''; + + sources.forEach((source, index) => { + const card = document.createElement('div'); + card.className = 'source-card'; + + // Create metadata tags + const metaHtml = createMetaTags(source.metadata || {}); + const content = source.text || 'No content available'; + + card.innerHTML = ` + ${metaHtml} +
${escapeHtml(content)}
+ `; + + sourcesList.appendChild(card); + }); + + sourcesSection.classList.remove('hidden'); +} + +function createMetaTags(metadata) { + const tags = []; + + // Add relevant metadata as tags + if (metadata.category) tags.push(metadata.category); + if (metadata.company) tags.push(metadata.company); + if (metadata.topic) tags.push(metadata.topic); + if (metadata.difficulty) tags.push(metadata.difficulty); + + if (tags.length === 0) { + return ''; + } + + const tagsHtml = tags.map(tag => `${escapeHtml(tag)}`).join(''); + return `
${tagsHtml}
`; +} + +function escapeHtml(text) { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} + +function showLoading() { + loadingEl.classList.remove('hidden'); +} + +function hideLoading() { + loadingEl.classList.add('hidden'); +} + +function showError(message) { + errorMessageEl.textContent = message; + errorEl.classList.remove('hidden'); +} + +function hideAll() { + answerSection.classList.add('hidden'); + sourcesSection.classList.add('hidden'); + errorEl.classList.add('hidden'); + loadingEl.classList.add('hidden'); + initialState.classList.add('hidden'); +} + +// Focus input on load +questionInput.focus(); diff --git a/frontend/style.css b/frontend/style.css new file mode 100644 index 0000000000..7ff19a4d66 --- /dev/null +++ b/frontend/style.css @@ -0,0 +1,291 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +body { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif; + background-color: #f5f7fa; + color: #333; + line-height: 1.6; +} + +.container { + max-width: 900px; + margin: 0 auto; + padding: 40px 20px; +} + +/* Header */ +header { + text-align: center; + margin-bottom: 40px; +} + +header h1 { + font-size: 2.5rem; + color: #1a202c; + margin-bottom: 8px; + font-weight: 700; +} + +.subtitle { + font-size: 1.1rem; + color: #718096; +} + +/* Input Section */ +.input-section { + margin-bottom: 30px; +} + +.input-wrapper { + display: flex; + gap: 12px; + background: white; + padding: 12px; + border-radius: 12px; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +#question-input { + flex: 1; + padding: 12px 16px; + border: 2px solid #e2e8f0; + border-radius: 8px; + font-size: 1rem; + font-family: inherit; + transition: border-color 0.2s; +} + +#question-input:focus { + outline: none; + border-color: #4299e1; +} + +.btn-primary { + padding: 12px 32px; + background-color: #4299e1; + color: white; + border: none; + border-radius: 8px; + font-size: 1rem; + 
font-weight: 600; + cursor: pointer; + transition: background-color 0.2s; +} + +.btn-primary:hover { + background-color: #3182ce; +} + +.btn-primary:active { + background-color: #2b6cb0; +} + +.btn-primary:disabled { + background-color: #a0aec0; + cursor: not-allowed; +} + +/* Loading State */ +.loading-state { + text-align: center; + padding: 60px 20px; +} + +.spinner { + width: 50px; + height: 50px; + margin: 0 auto 16px; + border: 4px solid #e2e8f0; + border-top-color: #4299e1; + border-radius: 50%; + animation: spin 0.8s linear infinite; +} + +@keyframes spin { + to { transform: rotate(360deg); } +} + +.loading-state p { + color: #718096; + font-size: 1.1rem; +} + +/* Error State */ +.error-state { + background-color: #fed7d7; + border-left: 4px solid #e53e3e; + padding: 20px; + border-radius: 8px; + margin-bottom: 30px; +} + +.error-icon { + font-size: 2rem; + margin-bottom: 8px; +} + +.error-state p { + color: #c53030; + font-size: 1rem; +} + +/* Answer Section */ +.answer-section { + background: white; + padding: 30px; + border-radius: 12px; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + margin-bottom: 30px; +} + +.answer-section h2 { + font-size: 1.5rem; + color: #1a202c; + margin-bottom: 16px; + font-weight: 600; +} + +.answer-content { + color: #2d3748; + font-size: 1.05rem; + line-height: 1.8; + white-space: pre-wrap; +} + +/* Sources Section */ +.sources-section { + margin-bottom: 30px; +} + +.sources-section h2 { + font-size: 1.5rem; + color: #1a202c; + margin-bottom: 20px; + font-weight: 600; +} + +.sources-list { + display: flex; + flex-direction: column; + gap: 16px; +} + +.source-card { + background: white; + padding: 20px; + border-radius: 8px; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + border-left: 4px solid #4299e1; +} + +.source-meta { + display: flex; + gap: 8px; + flex-wrap: wrap; + margin-bottom: 12px; +} + +.source-tag { + background-color: #ebf8ff; + color: #2b6cb0; + padding: 4px 12px; + border-radius: 16px; + font-size: 0.85rem; + 
"""
Data Ingestion Script

Reads JSON data, chunks text, generates embeddings, and upserts to Endee.

Usage:
    python scripts/ingest_data.py [path/to/data.json]

Falls back to the bundled ``data/placement_data.json`` when no path is
given on the command line.
"""

import asyncio
import sys
from pathlib import Path

from dotenv import load_dotenv

# Resolve the project root from this file's location so the script works
# no matter which working directory it is launched from.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Prepend (not append) so the project's own ``backend`` package wins over
# any installed package of the same name; matches scripts/quick_ingest.py.
sys.path.insert(0, str(PROJECT_ROOT))

from backend.ingest import ingest_data

load_dotenv()

# Default dataset shipped with the repository.
DEFAULT_DATA_FILE = PROJECT_ROOT / "data" / "placement_data.json"


async def main() -> None:
    """Parse the CLI argument, validate the data file, and run ingestion.

    Exits with status 1 when the file is missing or ingestion raises.
    """
    if len(sys.argv) < 2:
        file_path = DEFAULT_DATA_FILE
        print(f"No file specified, using default: {file_path}")
    else:
        file_path = Path(sys.argv[1])

    if not file_path.exists():
        print(f"Error: File {file_path} not found")
        sys.exit(1)

    try:
        result = await ingest_data(str(file_path))
    except Exception as e:
        print(f"Error during ingestion: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        print(f"\n✓ Ingestion complete: {result}")


if __name__ == "__main__":
    asyncio.run(main())
Test with:") + print(" curl -X POST http://localhost:8000/ask -H 'Content-Type: application/json' -d '{\"question\": \"TCS interview\", \"top_k\": 3}'") + except Exception as e: + print(f"\n✗ Failed to ingest: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + asyncio.run(main())