Back to Blog
Guide

Automate Social Media Data Collection: Complete Guide

January 5, 2026
8 min read
By SociaVault Team
Automation · Data Collection · Social Media · Pipelines · ETL

Automate Social Media Data Collection

Manual data collection doesn't scale.

You need automation—systems that run on schedule, handle errors, and deliver clean data without babysitting.

This guide covers everything from simple cron jobs to production-grade pipelines.

What We're Building

An automated system that:

  • Collects social media data on schedule
  • Handles failures gracefully
  • Stores data efficiently
  • Alerts you when something's wrong
  • Scales to thousands of profiles

Architecture Options

Simple: Cron + Script

Best for: Small projects, personal use

┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   Cron Job  │ --> │   Script    │ --> │   Database  │
└─────────────┘     └─────────────┘     └─────────────┘

Medium: Task Queue

Best for: Growing projects, multiple data sources

┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│  Scheduler  │ --> │   Queue     │ --> │   Workers   │
└─────────────┘     │  (Redis)    │     │  (Multiple) │
                    └─────────────┘     └─────────────┘
                                              v
                                        ┌─────────────┐
                                        │   Database  │
                                        └─────────────┘

Enterprise: Orchestrated Pipelines

Best for: Large scale, complex workflows

Airflow/Dagster → Kubernetes Jobs → API Calls → Data Lake → Transform → Warehouse

Simple Automation: Node.js + Cron

Basic Script

// collect.js
require('dotenv').config();
const Database = require('better-sqlite3');

const API_KEY = process.env.SOCIAVAULT_API_KEY;
const API_BASE = 'https://api.sociavault.com/v1/scrape';

// Local SQLite store for collected profile snapshots.
const db = new Database('social_data.db');

// Initialize database.
// NOTE: SQLite does not allow expressions such as DATE(collected_at) inside a
// table-level UNIQUE constraint (expressions are only valid in CREATE INDEX),
// so the "one snapshot per account per day" rule is enforced with a unique
// expression index instead.
db.exec(`
  CREATE TABLE IF NOT EXISTS profiles (
    id INTEGER PRIMARY KEY,
    platform TEXT,
    username TEXT,
    followers INTEGER,
    following INTEGER,
    posts INTEGER,
    engagement_rate REAL,
    collected_at DATETIME DEFAULT CURRENT_TIMESTAMP
  );
  CREATE UNIQUE INDEX IF NOT EXISTS idx_profiles_daily
    ON profiles (platform, username, DATE(collected_at));
`);

/**
 * Fetch one profile snapshot from the SociaVault API.
 *
 * @param {string} platform - API platform segment (e.g. 'tiktok').
 * @param {string} username - Account handle to look up.
 * @returns {Promise<object>} Parsed JSON response body.
 * @throws {Error} When the API responds with a non-2xx status.
 */
async function fetchProfile(platform, username) {
  // Build the URL via searchParams so the username is percent-encoded;
  // raw template interpolation breaks on handles containing &, #, or spaces.
  const url = new URL(`${API_BASE}/${platform}/profile`);
  url.searchParams.set('username', username);

  const response = await fetch(url, {
    headers: { 'Authorization': `Bearer ${API_KEY}` }
  });

  if (!response.ok) {
    throw new Error(`API error: ${response.status}`);
  }

  return response.json();
}

/**
 * Collect a snapshot for every tracked account and upsert it into SQLite.
 * Each account is handled independently: one failed fetch is logged and
 * skipped so the rest of the run still completes.
 */
async function collectData() {
  const accounts = [
    { platform: 'tiktok', username: 'charlidamelio' },
    { platform: 'tiktok', username: 'khaby.lame' },
    { platform: 'instagram', username: 'natgeo' },
    { platform: 'youtube', username: 'MrBeast' }
  ];

  const upsert = db.prepare(`
    INSERT OR REPLACE INTO profiles 
    (platform, username, followers, following, posts, engagement_rate)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  for (const { platform, username } of accounts) {
    try {
      console.log(`Fetching ${platform}/${username}...`);

      const { data } = await fetchProfile(platform, username);
      const followers = data.follower_count || data.followers;

      upsert.run(
        platform,
        username,
        followers,
        data.following_count || data.following,
        data.video_count || data.posts_count || 0,
        data.engagement_rate || null
      );

      console.log(`${followers} followers`);

      // Space requests 500ms apart to stay under the API rate limit.
      await new Promise((resolve) => setTimeout(resolve, 500));
    } catch (error) {
      console.error(`  ✗ Error: ${error.message}`);
    }
  }

  console.log('Collection complete!');
}

collectData().catch(console.error);

Schedule with Cron

# Run every hour
0 * * * * cd /path/to/project && node collect.js >> logs/collect.log 2>&1

# Run every day at 6 AM
0 6 * * * cd /path/to/project && node collect.js >> logs/collect.log 2>&1

Or use node-cron for in-process scheduling:

// In-process alternative to system cron: keeps scheduling next to the
// collection code, but requires the Node process to stay running.
const cron = require('node-cron');

// Run every hour ('0 * * * *' = at minute 0 of every hour).
cron.schedule('0 * * * *', () => {
  console.log('Starting scheduled collection...');
  // collectData is defined in collect.js above; failures are logged, not rethrown.
  collectData().catch(console.error);
});

console.log('Scheduler running...');

Medium Scale: Queue-Based System

With BullMQ (Redis)

// worker.js
const { Worker, Queue } = require('bullmq');
const Redis = require('ioredis');

const connection = new Redis(process.env.REDIS_URL);

// Queue the scheduler pushes collection jobs onto.
const profileQueue = new Queue('profiles', { connection });

// Processor for a single collection job: fetch the profile, persist it,
// and report the follower count back to the queue.
async function processProfileJob(job) {
  const { platform, username } = job.data;

  console.log(`Processing ${platform}/${username}`);

  const result = await fetchProfile(platform, username);
  await saveToDatabase(result.data);

  return { success: true, followers: result.data.follower_count };
}

const worker = new Worker('profiles', processProfileJob, {
  connection,
  // Up to 5 jobs in flight at once...
  concurrency: 5,
  // ...but never start more than 10 jobs in any 1000ms window.
  limiter: {
    max: 10,
    duration: 1000
  }
});

worker.on('completed', (job, result) => {
  console.log(`${job.data.username}: ${result.followers} followers`);
});

worker.on('failed', (job, error) => {
  console.error(`${job.data.username}: ${error.message}`);
});
// scheduler.js
const { Queue } = require('bullmq');
const Redis = require('ioredis');

const connection = new Redis(process.env.REDIS_URL);
const profileQueue = new Queue('profiles', { connection });

// Enqueue one collection job per active tracked account.
// NOTE(review): `db` is not defined in this snippet, and `db.all(...)`
// matches the async sqlite3 package API rather than the better-sqlite3
// client used in collect.js — confirm which database client is intended.
async function scheduleCollection() {
  const accounts = await db.all('SELECT * FROM tracked_accounts WHERE active = 1');
  
  for (const account of accounts) {
    // Each job retries up to 3 times with exponential backoff (2s, 4s, 8s).
    await profileQueue.add('collect', {
      platform: account.platform,
      username: account.username
    }, {
      attempts: 3,
      backoff: {
        type: 'exponential',
        delay: 2000
      }
    });
  }
  
  console.log(`Scheduled ${accounts.length} jobs`);
}

// Run on schedule
const cron = require('node-cron');
cron.schedule('0 */6 * * *', scheduleCollection); // Every 6 hours

Python Alternative

# collector.py
import os
import sqlite3
import requests
import schedule
import time
from datetime import datetime

API_KEY = os.getenv('SOCIAVAULT_API_KEY')
API_BASE = 'https://api.sociavault.com/v1/scrape'

def get_db(path='social_data.db'):
    """Open (and lazily initialize) the SQLite snapshot database.

    Args:
        path: Database file to open, or ':memory:' for an in-memory
            database. Defaults to the on-disk file used by the collector,
            so existing callers are unaffected.

    Returns:
        An open sqlite3.Connection with the ``profiles`` table ensured.
    """
    conn = sqlite3.connect(path)
    conn.execute('''
        CREATE TABLE IF NOT EXISTS profiles (
            id INTEGER PRIMARY KEY,
            platform TEXT,
            username TEXT,
            followers INTEGER,
            collected_at TEXT
        )
    ''')
    return conn

def fetch_profile(platform, username):
    """GET one profile from the SociaVault API; raise on HTTP error status."""
    url = f'{API_BASE}/{platform}/profile'
    headers = {'Authorization': f'Bearer {API_KEY}'}
    response = requests.get(url, params={'username': username}, headers=headers)
    response.raise_for_status()
    return response.json()

def collect_all():
    """Fetch every tracked account once and append one snapshot row each.

    Accounts are processed independently: a failed fetch is printed and
    skipped so the remaining accounts are still collected.
    """
    accounts = [
        ('tiktok', 'charlidamelio'),
        ('instagram', 'natgeo'),
        ('youtube', 'MrBeast')
    ]

    conn = get_db()

    for platform, username in accounts:
        try:
            payload = fetch_profile(platform, username)
            data = payload['data']
            followers = data.get('follower_count') or data.get('followers')

            conn.execute(
                'INSERT INTO profiles (platform, username, followers, collected_at) VALUES (?, ?, ?, ?)',
                (platform, username, followers, datetime.now().isoformat())
            )
            conn.commit()

            print(f'✓ {platform}/{username}: {followers}')
            time.sleep(0.5)  # space requests out for rate limiting

        except Exception as e:
            print(f'✗ {platform}/{username}: {e}')

    conn.close()

# Schedule: hourly snapshots plus an explicit 06:00 daily run (the 06:00 run
# coincides with an hourly tick, so that hour collects twice).
schedule.every().hour.do(collect_all)
schedule.every().day.at('06:00').do(collect_all)

print('Scheduler running...')
# Poll loop: check for due jobs once a minute; blocks the process forever.
while True:
    schedule.run_pending()
    time.sleep(60)

No-Code: n8n Workflow

For those who prefer visual workflows:

{
  "name": "Social Media Collector",
  "nodes": [
    {
      "name": "Schedule Trigger",
      "type": "n8n-nodes-base.scheduleTrigger",
      "parameters": {
        "rule": {
          "interval": [{"field": "hours", "interval": 6}]
        }
      }
    },
    {
      "name": "Get Accounts",
      "type": "n8n-nodes-base.postgres",
      "parameters": {
        "operation": "executeQuery",
        "query": "SELECT platform, username FROM tracked_accounts WHERE active = true"
      }
    },
    {
      "name": "Loop Over Accounts",
      "type": "n8n-nodes-base.splitInBatches",
      "parameters": {
        "batchSize": 1
      }
    },
    {
      "name": "Fetch Profile",
      "type": "n8n-nodes-base.httpRequest",
      "parameters": {
        "url": "=https://api.sociavault.com/v1/scrape/{{ $json.platform }}/profile",
        "qs": {"username": "={{ $json.username }}"},
        "headers": {"Authorization": "Bearer {{ $env.SOCIAVAULT_API_KEY }}"}
      }
    },
    {
      "name": "Save to Database",
      "type": "n8n-nodes-base.postgres",
      "parameters": {
        "operation": "insert",
        "table": "profile_snapshots"
      }
    }
  ]
}

Error Handling & Alerts

Retry Logic

/**
 * Run an async operation, retrying with exponential backoff on failure.
 *
 * @param {() => Promise<any>} fn - Operation to attempt.
 * @param {number} [maxAttempts=3] - Total attempts before giving up.
 * @param {number} [baseDelay=1000] - Backoff unit in ms; attempt n waits
 *   2^n * baseDelay before the next try. Default preserves the original
 *   1s/2s/4s schedule.
 * @returns {Promise<any>} Result of the first successful attempt.
 * @throws The last error once all attempts are exhausted, or a RangeError
 *   if maxAttempts < 1 (the original silently resolved to undefined).
 */
async function fetchWithRetry(fn, maxAttempts = 3, baseDelay = 1000) {
  let lastError;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      if (attempt === maxAttempts) throw error;

      const delay = Math.pow(2, attempt) * baseDelay; // Exponential backoff
      console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
      await new Promise(r => setTimeout(r, delay));
    }
  }
  // Reached only when the loop never ran (maxAttempts < 1).
  throw lastError ?? new RangeError('maxAttempts must be at least 1');
}

// Usage: wrap any single fetch in retry logic; a failure surfaces only
// after all attempts (3 by default) are exhausted.
const data = await fetchWithRetry(() => 
  fetchProfile('tiktok', 'username')
);

Slack Alerts

/**
 * Post a plain-text notification to the configured Slack incoming webhook.
 * @param {string} message - Text shown in the Slack channel.
 */
async function sendAlert(message) {
  const payload = { text: message };
  await fetch(process.env.SLACK_WEBHOOK_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
}

// In your collector: wrap the whole run so any unhandled failure produces
// a Slack ping instead of dying silently.
try {
  await collectData();
} catch (error) {
  await sendAlert(`⚠️ Collection failed: ${error.message}`);
}

Health Checks

// health.js
/**
 * Probe each dependency and report which ones are reachable.
 * Probes are deliberately best-effort: a failing check leaves its flag
 * false rather than throwing, so one dead dependency never masks the rest.
 * @returns {Promise<{database: boolean, api: boolean, redis: boolean}>}
 */
async function checkHealth() {
  const checks = { database: false, api: false, redis: false };

  // Database: any successful statement proves the connection works.
  try {
    db.prepare('SELECT 1').get();
    checks.database = true;
  } catch (e) {}

  // API: healthy only on a 2xx response from the health endpoint.
  try {
    const response = await fetch(`${API_BASE}/health`, {
      headers: { 'Authorization': `Bearer ${API_KEY}` }
    });
    checks.api = response.ok;
  } catch (e) {}

  // Redis: simple PING round-trip.
  try {
    await redis.ping();
    checks.redis = true;
  } catch (e) {}

  return checks;
}

Scaling Tips

1. Batch Your Requests

// Instead of single requests
// NOTE(review): fetchProfile earlier in this guide takes (platform, username);
// this illustrative loop passes only a username — confirm the intended signature.
for (const username of usernames) {
  await fetchProfile(username);
}

// Use batching if API supports it
// One POST carries every request, cutting round-trips from N to 1.
const results = await fetch(`${API_BASE}/batch`, {
  method: 'POST',
  headers: { 
    'Authorization': `Bearer ${API_KEY}`,
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    // Each entry names the target endpoint and its query params.
    requests: usernames.map(u => ({
      endpoint: '/tiktok/profile',
      params: { username: u }
    }))
  })
}).then(r => r.json());

2. Cache Intelligently

const NodeCache = require('node-cache');
// Entries expire after 3600 seconds.
const cache = new NodeCache({ stdTTL: 3600 }); // 1 hour

/**
 * Return a profile from cache when still fresh, otherwise fetch it from
 * the API and cache the result for the next caller.
 */
async function getCachedProfile(platform, username) {
  const cacheKey = `${platform}:${username}`;

  const cached = cache.get(cacheKey);
  if (cached) return cached;

  const fresh = await fetchProfile(platform, username);
  cache.set(cacheKey, fresh);

  return fresh;
}

3. Use Connection Pooling

const { Pool } = require('pg');

// Shared pool: up to 20 concurrent connections, idle ones closed after 30s.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
  max: 20,
  idleTimeoutMillis: 30000
});

/**
 * Insert one profile snapshot using a pooled connection.
 * (The original showed placeholder `(...)` / `[...]`, which is not valid
 * JavaScript; this is a concrete, runnable version.)
 * The parameterized query avoids SQL injection, and release() in finally
 * returns the client to the pool even when the insert throws.
 * @param {{platform: string, username: string, followers: number, engagement_rate: number|null}} data
 */
async function saveProfile(data) {
  const client = await pool.connect();
  try {
    await client.query(
      `INSERT INTO profiles (platform, username, followers, engagement_rate)
       VALUES ($1, $2, $3, $4)`,
      [data.platform, data.username, data.followers, data.engagement_rate]
    );
  } finally {
    client.release();
  }
}

Monitoring Dashboard

Track your collection health:

// metrics.js
const prometheus = require('prom-client');

// Counter: total collection attempts, labeled by platform and by
// success/error outcome.
const collectionsTotal = new prometheus.Counter({
  name: 'social_collections_total',
  help: 'Total number of profile collections',
  labelNames: ['platform', 'status']
});

// Histogram: how long each collection takes, per platform.
const collectionDuration = new prometheus.Histogram({
  name: 'social_collection_duration_seconds',
  help: 'Duration of collection operations',
  labelNames: ['platform']
});

// In collector
// Time the fetch and record success/error; finally ensures the duration is
// observed even when the fetch throws. (This fragment assumes it runs inside
// an async function where `platform` and `username` are in scope.)
const timer = collectionDuration.startTimer({ platform });
try {
  await fetchProfile(platform, username);
  collectionsTotal.inc({ platform, status: 'success' });
} catch (error) {
  collectionsTotal.inc({ platform, status: 'error' });
} finally {
  timer();
}

Ready to automate your data collection?

Get your API key at sociavault.com with 50 free credits.


Related:

Found this helpful?

Share it with others who might benefit

Ready to Try SociaVault?

Start extracting social media data with our powerful API. No credit card required.