The API communicator component implements the connection to the OpenAI API via HTTP requests.
"use strict";
const fs = require('fs');
const path = require('path');
const apiCommunicator = require('./components/apiCommunicator');
// Select which backend the API communicator should send queries to.
apiCommunicator.setApi('openai-o3');

// Input CSV and output directory both live next to this script.
const INPUT_CSV = path.join(__dirname, 'wordpress_analysis_results.csv');
const OUTPUT_DIR = path.join(__dirname, 'results_categorization');
const CHUNK_SIZE = 10; // feel free to adjust the number of posts per query

// Create results_categorization directory if it doesn't exist.
// `recursive: true` makes the call a no-op when the directory is already there,
// which avoids the check-then-create race of a separate existsSync() test.
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
// Read the CSV file and split it into its non-empty lines.
// readFileSync() keeps a leading UTF-8 byte-order mark, and splitting on '\n'
// alone would leave a trailing '\r' on every line of a CRLF file; either one
// would make the header lookup below miss the first or last column name.
const rawCsv = fs.readFileSync(INPUT_CSV, 'utf8').replace(/^\uFEFF/, '');
const allLines = rawCsv.split(/\r?\n/).filter(line => line.trim() !== '');
if (allLines.length < 2) {
  // A header line plus at least one data line is required.
  console.error("CSV file does not contain enough data.");
  process.exit(1);
}

// Parse header to get column indices (names are trimmed so stray whitespace
// around a column name does not break the exact-match lookup).
const headers = allLines[0].split(',').map(header => header.trim());
const titleIndex = headers.indexOf('Title');
const commentsNormIndex = headers.indexOf('Comments_normalized');
if (titleIndex === -1 || commentsNormIndex === -1) {
  console.error("CSV must contain 'Title' and 'Comments_normalized' columns.");
  process.exit(1);
}
/**
 * Split one CSV line into its fields, honouring double-quoted fields.
 *
 * A field that starts with `"` may contain commas, and `""` inside it stands
 * for a literal quote. Lines that never use a leading quote are split exactly
 * like `line.split(',')`. If a quoted field is never closed, the line is
 * treated as malformed and the plain comma split is returned instead.
 *
 * @param {string} line - A single line of CSV text (no embedded line breaks).
 * @returns {string[]} The field values, with surrounding quotes removed.
 */
function splitCsvLine(line) {
  const fields = [];
  let current = '';
  let inQuotes = false;
  for (let pos = 0; pos < line.length; pos++) {
    const ch = line[pos];
    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line[pos + 1] === '"') {
        // Doubled quote inside a quoted field is an escaped literal quote.
        current += '"';
        pos++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && current === '') {
      // A quote only opens a quoted field when it is the field's first character.
      inQuotes = true;
    } else if (ch === ',') {
      fields.push(current);
      current = '';
    } else {
      current += ch;
    }
  }
  if (inQuotes) {
    // Unterminated quote: keep the previous naive behaviour for this line.
    return line.split(',');
  }
  fields.push(current);
  return fields;
}

// Build an array of post objects (only Title and Comments_normalized are used)
const posts = [];
for (let i = 1; i < allLines.length; i++) {
  // Quote-aware split so a title such as "Hello, world" stays in one column.
  const row = splitCsvLine(allLines[i]);
  // If the row has an insufficient number of columns, ignore it.
  if (row.length < headers.length) continue;
  const title = row[titleIndex].trim();
  const commentNormalized = row[commentsNormIndex].trim();
  // Only include rows with content in the title field
  if (title) {
    posts.push({ title, commentNormalized });
  }
}

// Split posts into chunks of at most CHUNK_SIZE posts each
const chunks = [];
for (let i = 0; i < posts.length; i += CHUNK_SIZE) {
  chunks.push(posts.slice(i, i + CHUNK_SIZE));
}
// Optional first command-line argument: the 1-based batch number to start from.
const args = process.argv.slice(2);
let startBatch = 1; // default starting batch is 1
if (args.length > 0) {
  // Accept whole decimal numbers only. parseInt() on its own would silently
  // read inputs such as "3abc" or "2.7" as 3 and 2, starting at the wrong batch.
  const rawStartBatch = args[0].trim();
  startBatch = /^\+?\d+$/.test(rawStartBatch) ? Number.parseInt(rawStartBatch, 10) : Number.NaN;
  if (Number.isNaN(startBatch) || startBatch < 1 || startBatch > chunks.length) {
    console.error(`Invalid starting batch number. Please provide a number between 1 and ${chunks.length}.`);
    process.exit(1);
  }
}
const startIndex = startBatch - 1; // convert to 0-indexed
/**
 * Send one chunk of posts to the LLM for categorization and store the reply.
 *
 * The chunk is rendered as a two-column CSV, embedded in the instruction
 * prompt, and passed to `apiCommunicator.sendMessage`. Whatever that call
 * resolves to is written to `result_<n>` inside OUTPUT_DIR, where `<n>` is the
 * 1-based chunk number. Failures while sending or writing are logged and not
 * rethrown.
 *
 * @param {{title: string, commentNormalized: string}[]} chunk - Posts to categorize.
 * @param {number} chunkIndex - 0-based position of the chunk in the overall list.
 * @returns {Promise<void>}
 */
async function processChunk(chunk, chunkIndex) {
  // Header line followed by one "title,comments" line per post.
  const postLines = chunk.map(({ title, commentNormalized }) => `${title},${commentNormalized}`);
  const postsTable = `Title,Comments_normalized\n${postLines.join("\n")}`;

  // Instructions asking the model for a four-column CSV answer.
  const prompt = `You will be categorizing a list of posts based on their titles. The list will be provided in the following format:
${postsTable}
Your task is to assign a specific category and a generic category to each post based on its title. The output should be in CSV format with the following columns: Title,Comments_normalized,Category,Generic Category.
To determine the categories:
1. Analyze the title of each post to identify its main topic or theme.
2. Assign a specific category that best describes the main topic.
3. Assign a generic category that broadly encompasses the specific category.
Examples of specific categories include:
AI Development - AI technology advances, new AI models, capabilities
AI Applications - Practical uses of AI in various fields
AI Ethics & Society - Social impact, regulations, concerns about AI
Robotics - Robot development, automation
Virtual Reality/AR - VR/AR technology and applications
Cybersecurity - Security, hacking, privacy
Mobile Tech - Smartphones, apps, mobile devices
Transportation Tech - EVs, autonomous vehicles, flying vehicles
Social Media - Platforms, trends, impacts
Gaming - Video games, gaming technology
Business Tech - E-commerce, digital transformation
Healthcare Tech - Medical technology, biotech
Space Tech - Space exploration, satellites
Environmental Tech - Green technology, sustainability
Consumer Electronics - Gadgets, home devices
Digital Infrastructure - Networks, internet, cloud
Cryptocurrency - Blockchain, digital currencies
Quantum Computing - Quantum technology developments
Generic categories include:
-Artificial Intelligence
-Hardware & Devices
-Software & Internet
-Business & Economy
-Society & Ethics
-Science & Research
If you cannot determine a suitable category for a post, use "Uncategorized" for both the specific and generic categories.
Ensure that the processed list of posts is output in the following format and NOTHING MORE:
Title,Comments_normalized,Category,Generic Category
[Post title],[Comments_normalized value],[Specific category],[Generic category]`;

  // Human-facing numbering is 1-based.
  const chunkNumber = chunkIndex + 1;
  try {
    console.log(`Processing chunk ${chunkNumber} (with ${chunk.length} posts)...`);
    const reply = await apiCommunicator.sendMessage(prompt);
    const resultPath = path.join(OUTPUT_DIR, `result_${chunkNumber}`);
    fs.writeFileSync(resultPath, reply, 'utf8');
    console.log(`Chunk ${chunkNumber} processed. Output written to ${resultPath}`);
  } catch (error) {
    // Log and carry on so that one failed chunk does not stop the whole run.
    console.error(`Error processing chunk ${chunkNumber}:`, error);
  }
}
/**
 * Process every chunk from the requested starting batch onwards, one at a
 * time, awaiting each chunk before the next one is sent.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`Starting processing at batch ${startBatch} (chunk index ${startIndex}) out of ${chunks.length} total batches.`);
  for (let i = startIndex; i < chunks.length; i++) {
    await processChunk(chunks[i], i);
  }
}

// Do not leave the promise floating: report anything unexpected and make the
// process exit with a failure status.
main().catch((error) => {
  console.error('Unexpected error while processing batches:', error);
  process.exitCode = 1;
});
With chunking, we were able to derive additional data that improved the performance of our model.
Leave a Reply