techblog

The API communicator component handles the connection to the OpenAI API via HTTP requests.

"use strict";

const fs = require('fs');
const path = require('path');
const apiCommunicator = require('./components/apiCommunicator');
// Configure the communicator with the 'openai-o3' API identifier.
// NOTE(review): presumably this selects the backend/model used by
// sendMessage() — confirm in ./components/apiCommunicator.
apiCommunicator.setApi('openai-o3');

// Input CSV, expected to sit next to this script.
const INPUT_CSV = path.join(__dirname, 'wordpress_analysis_results.csv');
// Directory (next to this script) that receives one result file per chunk.
const OUTPUT_DIR = path.join(__dirname, 'results_categorization');
const CHUNK_SIZE = 10; // feel free to adjust the number of posts per query

// Make sure the results_categorization directory exists. With
// `recursive: true` the call succeeds silently when the directory is already
// there, so no separate existence check (and no check-then-create race) is
// needed.
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

// Read the whole CSV into memory and split it into its non-blank lines.
const rawCsv = fs.readFileSync(INPUT_CSV, 'utf8');
// - A leading UTF-8 byte-order mark is dropped so it cannot become part of the
//   first header name.
// - Both LF and CRLF line endings are accepted so a trailing '\r' cannot stick
//   to the last column of each line (which would break the header lookup).
const allLines = rawCsv
  .replace(/^\uFEFF/, '')
  .split(/\r?\n/)
  .filter(line => line.trim() !== '');

// A usable file needs the header row plus at least one data row.
if (allLines.length < 2) {
  console.error("CSV file does not contain enough data.");
  process.exit(1);
}

/**
 * Split a single CSV line into its fields.
 *
 * Unlike a plain `split(',')`, a field that starts with a double quote may
 * contain commas, and `""` inside such a field stands for one literal quote.
 * The surrounding quotes are removed from the returned value. A quote that
 * appears after other characters in a field is kept as a literal character.
 * Quoted fields spanning several lines are NOT supported, because the caller
 * has already split the file by line.
 *
 * @param {string} line - One line of CSV text without its line terminator.
 * @returns {string[]} The field values, in column order.
 */
function parseCsvLine(line) {
  const fields = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line[i + 1] === '"') {
        // Doubled quote inside a quoted field -> one literal quote.
        current += '"';
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && current === '') {
      // A quote only opens a quoted field at the very start of the field.
      inQuotes = true;
    } else if (ch === ',') {
      fields.push(current);
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current);
  return fields;
}

// Parse header to get column indices. Names are trimmed so stray whitespace
// (or a leftover '\r' from CRLF files) does not defeat the lookup.
const headers = parseCsvLine(allLines[0]).map(name => name.trim());
const titleIndex = headers.indexOf('Title');
const commentsNormIndex = headers.indexOf('Comments_normalized');

if (titleIndex === -1 || commentsNormIndex === -1) {
  console.error("CSV must contain 'Title' and 'Comments_normalized' columns.");
  process.exit(1);
}

// Build an array of post objects (only Title and Comments_normalized are used)
const posts = [];
for (let i = 1; i < allLines.length; i++) {
  const row = parseCsvLine(allLines[i]);
  // If the row has an insufficient number of columns, ignore it.
  if (row.length < headers.length) continue;
  const title = row[titleIndex].trim();
  const commentNormalized = row[commentsNormIndex].trim();

  // Only include rows with content in the title field
  if (title) {
    posts.push({ title, commentNormalized });
  }
}

// Partition the posts into consecutive batches of at most CHUNK_SIZE entries;
// only the final batch may be shorter.
const chunks = Array.from(
  { length: Math.ceil(posts.length / CHUNK_SIZE) },
  (_, batchNo) => posts.slice(batchNo * CHUNK_SIZE, (batchNo + 1) * CHUNK_SIZE)
);

/**
 * Validate a user-supplied starting batch number.
 *
 * @param {string} rawValue - The raw command-line argument.
 * @param {number} totalBatches - How many batches are available.
 * @returns {number|null} The 1-based batch number, or null when rawValue is
 *   not a whole decimal number between 1 and totalBatches (inclusive).
 */
function parseStartBatch(rawValue, totalBatches) {
  const text = String(rawValue).trim();
  // parseInt on its own would accept "3abc" or "2.7" by silently dropping the
  // tail, so insist on digits only before converting.
  if (!/^\d+$/.test(text)) return null;
  const batchNumber = Number.parseInt(text, 10);
  if (batchNumber < 1 || batchNumber > totalBatches) return null;
  return batchNumber;
}

// Optional first CLI argument: the 1-based batch to start (resume) from.
const args = process.argv.slice(2);
let startBatch = 1; // default starting batch is 1
if (args.length > 0) {
  const requestedBatch = parseStartBatch(args[0], chunks.length);
  if (requestedBatch === null) {
    console.error(`Invalid starting batch number. Please provide a number between 1 and ${chunks.length}.`);
    process.exit(1);
  }
  startBatch = requestedBatch;
}
const startIndex = startBatch - 1; // convert to 0-indexed

/**
 * Render one value as a CSV field.
 *
 * Values containing a comma, a double quote or a line break are wrapped in
 * double quotes with embedded quotes doubled; all other values are returned
 * unchanged.
 *
 * @param {*} value - The value to render (converted with String()).
 * @returns {string} The CSV-safe field text.
 */
function escapeCsvField(value) {
  const text = String(value);
  if (!/[",\r\n]/.test(text)) return text;
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Build the categorization prompt for one chunk of posts.
 *
 * @param {Array<{title: string, commentNormalized: string}>} chunk - Posts to
 *   list in the prompt.
 * @returns {string} The full prompt text, embedding the posts as a two-column
 *   CSV (Title,Comments_normalized).
 */
function buildCategorizationPrompt(chunk) {
  // Build a simple CSV of posts – header plus one line per post. Fields are
  // escaped so a title containing a comma or quote still reads as one column.
  const postRows = chunk.map(
    post => `${escapeCsvField(post.title)},${escapeCsvField(post.commentNormalized)}`
  );
  const postCsv = `Title,Comments_normalized\n${postRows.join("\n")}`;

  // Detailed instructions asking for the four-column CSV result.
  return `You will be categorizing a list of posts based on their titles. The list will be provided in the following format:

${postCsv}

Your task is to assign a specific category and a generic category to each post based on its title. The output should be in CSV format with the following columns: Title,Comments_normalized,Category,Generic Category.

To determine the categories:
1. Analyze the title of each post to identify its main topic or theme.
2. Assign a specific category that best describes the main topic.
3. Assign a generic category that broadly encompasses the specific category.

Examples of specific categories include:
AI Development - AI technology advances, new AI models, capabilities
AI Applications - Practical uses of AI in various fields
AI Ethics & Society - Social impact, regulations, concerns about AI
Robotics - Robot development, automation
Virtual Reality/AR - VR/AR technology and applications
Cybersecurity - Security, hacking, privacy
Mobile Tech - Smartphones, apps, mobile devices
Transportation Tech - EVs, autonomous vehicles, flying vehicles
Social Media - Platforms, trends, impacts
Gaming - Video games, gaming technology
Business Tech - E-commerce, digital transformation
Healthcare Tech - Medical technology, biotech
Space Tech - Space exploration, satellites
Environmental Tech - Green technology, sustainability
Consumer Electronics - Gadgets, home devices
Digital Infrastructure - Networks, internet, cloud
Cryptocurrency - Blockchain, digital currencies
Quantum Computing - Quantum technology developments

Generic categories include:
-Artificial Intelligence
-Hardware & Devices
-Software & Internet
-Business & Economy
-Society & Ethics
-Science & Research

If you cannot determine a suitable category for a post, use "Uncategorized" for both the specific and generic categories.

Ensure that the processed list of posts is output in the following format and NOTHING MORE:

Title,Comments_normalized,Category,Generic Category
[Post title],[Comments_normalized value],[Specific category],[Generic category]`;
}

/**
 * Send one chunk of posts to the LLM and store the reply.
 *
 * Whatever `apiCommunicator.sendMessage` resolves with is written unchanged to
 * `result_<chunkIndex + 1>` inside OUTPUT_DIR. Failures (request or file
 * write) are logged and swallowed on purpose, so one bad chunk does not stop
 * the remaining chunks from being processed.
 *
 * @param {Array<{title: string, commentNormalized: string}>} chunk - Posts in
 *   this batch.
 * @param {number} chunkIndex - 0-based index of the batch; logs and file names
 *   use the 1-based number.
 * @returns {Promise<void>} Resolves once the chunk was handled (or its error
 *   was logged).
 */
async function processChunk(chunk, chunkIndex) {
  const prompt = buildCategorizationPrompt(chunk);

  try {
    console.log(`Processing chunk ${chunkIndex + 1} (with ${chunk.length} posts)...`);
    const response = await apiCommunicator.sendMessage(prompt);
    const outputFile = path.join(OUTPUT_DIR, `result_${chunkIndex + 1}`);
    fs.writeFileSync(outputFile, response, 'utf8');
    console.log(`Chunk ${chunkIndex + 1} processed. Output written to ${outputFile}`);
  } catch (error) {
    console.error(`Error processing chunk ${chunkIndex + 1}:`, error);
  }
}

/**
 * Run every batch from the requested starting batch through the last one.
 * Each batch is awaited before the next one begins, so batches are handled
 * strictly one after another.
 *
 * @returns {Promise<void>} Resolves after the last batch has been handled.
 */
async function main() {
  console.log(`Starting processing at batch ${startBatch} (chunk index ${startIndex}) out of ${chunks.length} total batches.`);
  const remaining = chunks.slice(startIndex);
  for (const [offset, batch] of remaining.entries()) {
    // Pass the batch's position in the full list, not in the remaining slice.
    await processChunk(batch, startIndex + offset);
  }
}

// Start the run. A rejection escaping main() is reported and turned into a
// non-zero exit code instead of being left as an unhandled floating promise.
main().catch((error) => {
  console.error('Unexpected error while processing batches:', error);
  process.exitCode = 1;
});

With chunking, we were able to derive additional data that improved the performance of our model.


Comments

Leave a Reply

Your email address will not be published. Required fields are marked *