Job-assistant/parser.js at main · uyenbui2930/Job-assistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// utils/parser.js
const fs = require('fs');
const path = require('path');
const util = require('util');
const pdf = require('pdf-parse'); // For PDF
const mammoth = require('mammoth'); // For DOCX

// Promise-returning wrapper around the callback-style fs.readFile, created
// with util.promisify so the file read can be awaited.
const readFile = util.promisify(fs.readFile);

/**
 * Reads a Word (.docx) document and returns its raw text.
 *
 * The work is delegated to mammoth's `extractRawText`, which is given the
 * file location through its `path` option.
 * @param {string} filePath - Location of the .docx file on disk.
 * @returns {Promise<string>} Resolves with the `value` field of mammoth's result.
 */
async function parseDocx(filePath) {
    const { value: rawText } = await mammoth.extractRawText({ path: filePath });
    return rawText;
}

/**
 * Loads a PDF from disk and returns the text extracted from it.
 *
 * The file is read fully into memory and the resulting buffer is passed to
 * the `pdf` parser function.
 * @param {string} filePath - Location of the .pdf file on disk.
 * @returns {Promise<string>} Resolves with the `text` field of the parser's result.
 */
async function parsePdf(filePath) {
    // The parser is handed the file's bytes rather than a path, so read first.
    const fileBytes = await readFile(filePath);
    const { text } = await pdf(fileBytes);
    return text;
}

/**
 * Extracts the text of a resume file, choosing the parser from the file's
 * extension.
 *
 * The extension is read from the final path segment with `path.extname` and
 * compared case-insensitively: `docx` is handled by `parseDocx`, `pdf` by
 * `parsePdf`. Any other extension, or no extension at all, logs a warning
 * and resolves with `null`.
 *
 * @param {string} filePath - Path to the resume file.
 * @returns {Promise<string|null>} The extracted text, or `null` when the
 *     file type is unsupported.
 * @throws {TypeError} If `filePath` is not a string (raised by `path.extname`).
 * @throws {Error} `Failed to parse file: <filePath>` when the selected parser
 *     fails; the underlying error is attached as `cause`.
 */
async function parseResume(filePath) {
    // path.extname only inspects the last path segment, so a dot in a directory
    // name, or an extensionless file literally called "pdf" or "docx", is not
    // mistaken for a supported extension (which split('.').pop() would do).
    const extension = path.extname(filePath).slice(1).toLowerCase();

    try {
        if (extension === 'docx') {
            console.log(`Parsing DOCX: ${filePath}`);
            return await parseDocx(filePath);
        } else if (extension === 'pdf') {
            console.log(`Parsing PDF: ${filePath}`);
            return await parsePdf(filePath);
        } else {
            console.warn(`Unsupported file type: ${extension}`);
            return null;
        }
    } catch (error) {
        console.error(`Error during file parsing for ${filePath}:`, error.message);
        // Keep the original error (and its stack) reachable for callers that
        // need to know why parsing failed; the message itself is unchanged.
        throw new Error(`Failed to parse file: ${filePath}`, { cause: error });
    }
}

// Only parseResume is placed on the exports object; parseDocx and parsePdf
// are not exported from this module.
module.exports = { parseResume };