TCP/IP Stack from Scratch

Goal: Build a minimal TCP echo server using raw sockets. Understand how TCP actually works under the hood — the state machine, segment format, retransmission, and flow control. Instead of letting the kernel's stream (SOCK_STREAM) sockets run TCP for you, you implement the protocol yourself on top of a raw socket.

Prerequisites: C sockets basics, understanding of IP/TCP headers, TCP state machine


What Berkeley Sockets Hide

When you call connect(), send(), recv() — the kernel implements TCP. You never see:

  • SYN / SYN-ACK / ACK handshake
  • Sequence number management
  • Retransmission timers
  • Flow control (sliding window)
  • Congestion control

This tutorial builds a TCP echo server that handles packets directly.


TCP State Machine

Before code, understand the states:

Active open, active close (typical client):

  CLOSED → SYN_SENT → ESTABLISHED → FIN_WAIT_1 → FIN_WAIT_2 → TIME_WAIT → CLOSED

Passive open, passive close (typical server, and the path this echo server follows):

  CLOSED → LISTEN → SYN_RECEIVED → ESTABLISHED → CLOSE_WAIT → LAST_ACK → CLOSED

Key transitions:

  • LISTEN → SYN_RECEIVED: SYN received, SYN+ACK sent
  • SYN_RECEIVED → ESTABLISHED: ACK received
  • ESTABLISHED → FIN_WAIT_1: First FIN sent
  • FIN_WAIT_1 → FIN_WAIT_2: ACK of our FIN received
  • FIN_WAIT_2 → TIME_WAIT: Their FIN received, ACK sent
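  • TIME_WAIT → CLOSED: 2MSL timeout
  • ESTABLISHED → CLOSE_WAIT: Their FIN received, ACK sent
  • CLOSE_WAIT → LAST_ACK: Our FIN sent
  • LAST_ACK → CLOSED: ACK of our FIN received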

IP and TCP Header Structures

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
 
// Ethernet II frame header (14 bytes): two 6-byte MAC addresses and a
// 2-byte EtherType.
// ETH_TYPE_IP is the EtherType value that identifies an IPv4 payload.
#define ETH_TYPE_IP 0x0800
 
// Packed so the struct has no padding and can be overlaid on a frame buffer.
typedef struct __attribute__((packed)) {
    uint8_t  dmac[6];   // destination MAC address
    uint8_t  smac[6];   // source MAC address
    uint16_t type;      // EtherType, e.g. ETH_TYPE_IP (network byte order on
                        // the wire — convert with htons()/ntohs())
} EthHdr;
 
// IPv4 header (20 bytes without options; ihl gives the actual length).
// Packed so the struct can be overlaid directly on a packet buffer.
// Multi-byte fields hold whatever byte order is in the buffer; the code in
// this file converts len with htons()/ntohs().
// NOTE(review): bit-field placement is implementation-defined. Declaring ihl
// before version matches the wire format only where the compiler fills a
// byte starting from its least-significant bits (the usual behaviour on
// little-endian targets) — confirm for your platform.
typedef struct __attribute__((packed)) {
    uint8_t  ihl:4,      // header length in 32-bit words (5 = 20 bytes)
             version:4;   // IP version (4)
    uint8_t  tos;        // type of service
    uint16_t len;        // total length (header + payload), in bytes
    uint16_t id;         // identification
    // NOTE(review): on the wire the 3 flag bits are the top bits of the
    // FIRST byte of this 16-bit field. A 3/13 bit-field split over a
    // little-endian uint16_t does not reproduce that layout, so these two
    // members are only safe to leave at zero. Prefer a single uint16_t plus
    // masks if fragmentation fields are ever needed.
    uint16_t flags:3,    // fragmentation flags
             frag:13;    // fragment offset
    uint8_t  ttl;        // time to live (hop limit)
    uint8_t  proto;      // payload protocol (6 = TCP)
    uint16_t csum;       // header checksum
    uint32_t src;        // source IPv4 address
    uint32_t dst;        // destination IPv4 address
} IpHdr;
 
// TCP header (20 bytes without options). Packed so it can be overlaid on a
// packet buffer; multi-byte fields are carried in network byte order.
typedef struct __attribute__((packed)) {
    uint16_t sport;       // source port
    uint16_t dport;       // destination port
    uint32_t seq;         // sequence number
    uint32_t ack;         // acknowledgment number
    // Byte 12 carries the data offset in its HIGH nibble and reserved bits in
    // its low nibble. Bit-fields are allocated from the least-significant bit
    // on little-endian GCC/Clang targets and from the most-significant bit on
    // big-endian ones, so the declaration order has to follow the byte order
    // for `off` to land in the high nibble.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    uint8_t  off:4;       // data offset: header length in 32-bit words
    uint8_t  :4;          // reserved
#else
    uint8_t  :4;          // reserved
    uint8_t  off:4;       // data offset: header length in 32-bit words
#endif
    uint8_t  flags;       // control flags (TCP_FIN, TCP_SYN, ...)
    uint16_t win;         // advertised receive window
    uint16_t csum;        // checksum over pseudo-header + segment
    uint16_t uptr;        // urgent pointer
} TcpHdr;
 
// TCP control flags: bit masks for the `flags` byte of the TCP header.
#define TCP_FIN  0x01   // sender has finished sending
#define TCP_SYN  0x02   // synchronize sequence numbers (connection setup)
#define TCP_RST  0x04   // reset the connection
#define TCP_PSH  0x08   // push buffered data to the application
#define TCP_ACK  0x10   // acknowledgment field is valid
#define TCP_URG  0x20   // urgent pointer field is valid

TCP Segment Checksum

TCP checksum covers pseudo-header + TCP segment:

// Add `len` bytes to a running one's-complement sum, reading them as
// big-endian 16-bit words. Working byte-by-byte keeps the result independent
// of host byte order and of the buffer's alignment.
static uint32_t csum_accumulate(uint32_t sum, const uint8_t *data, size_t len) {
    size_t i = 0;
    for (; i + 1 < len; i += 2) {
        sum += ((uint32_t)data[i] << 8) | data[i + 1];
    }
    if (i < len) {
        // Odd trailing byte: it is the high byte of a word padded with zero.
        sum += (uint32_t)data[i] << 8;
    }
    return sum;
}

// Compute the TCP checksum over the IPv4 pseudo-header, the TCP header and
// the payload.
//
//   ip          - IPv4 header; src, dst and proto are read from it
//   tcp         - TCP header; tcp->off gives its length in 32-bit words. The
//                 header is summed as-is, so the caller must zero tcp->csum
//                 first when generating a checksum
//   payload     - segment data (may be NULL when payload_len is 0)
//   payload_len - number of payload bytes (negative is treated as 0)
//
// Returns the checksum as a host-byte-order value; convert it with htons()
// before storing it in the header.
uint16_t tcp_checksum(IpHdr *ip, TcpHdr *tcp, uint8_t *payload, int payload_len) {
    if (payload == NULL || payload_len < 0) {
        payload_len = 0;
    }

    size_t tcp_hdr_len = (size_t)tcp->off * 4;
    size_t tcp_total = tcp_hdr_len + (size_t)payload_len;

    // Pseudo-header: src IP, dst IP, zero, protocol, TCP length (big-endian).
    // The addresses are copied exactly as they are stored in the IP header.
    uint8_t pseudo[12];
    memcpy(pseudo, &ip->src, 4);
    memcpy(pseudo + 4, &ip->dst, 4);
    pseudo[8] = 0;
    pseudo[9] = ip->proto;
    pseudo[10] = (uint8_t)((tcp_total >> 8) & 0xFF);
    pseudo[11] = (uint8_t)(tcp_total & 0xFF);

    uint32_t sum = 0;
    sum = csum_accumulate(sum, pseudo, sizeof pseudo);
    // The header length is a multiple of 4, so the payload starts on a
    // 16-bit word boundary and can be summed separately.
    sum = csum_accumulate(sum, (const uint8_t *)tcp, tcp_hdr_len);
    sum = csum_accumulate(sum, payload, (size_t)payload_len);

    // Fold the carries back in until the sum fits in 16 bits.
    while (sum >> 16) {
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    return (uint16_t)~sum;
}

Connection State

#include <stdatomic.h>
 
// Connection states of the TCP state machine tracked for each connection.
// Values are consecutive and start at zero.
typedef enum {
    TCP_CLOSED       = 0,  // no connection
    TCP_LISTEN       = 1,  // waiting for an incoming SYN
    TCP_SYN_RECEIVED = 2,  // SYN seen, SYN+ACK sent, waiting for the ACK
    TCP_ESTABLISHED  = 3,  // handshake complete, data may flow
    TCP_FIN_WAIT_1   = 4,  // our FIN sent, not yet acknowledged
    TCP_FIN_WAIT_2   = 5,  // our FIN acknowledged, waiting for the peer's FIN
    TCP_CLOSE_WAIT   = 6,  // peer's FIN received, our side still open
    TCP_LAST_ACK     = 7,  // our FIN sent after the peer's, waiting for its ACK
    TCP_TIME_WAIT    = 8   // waiting out 2MSL before closing
} TcpState;
 
// Per-connection control block: the address 4-tuple, sequence-number
// bookkeeping, the current protocol state and a receive buffer.
// The addresses and ports are stored exactly as main() copies them out of the
// received IP/TCP headers (no byte-order conversion is applied there).
typedef struct {
    uint32_t local_ip;    // our side's IPv4 address
    uint16_t local_port;  // our side's TCP port
    uint32_t remote_ip;   // peer's IPv4 address
    uint16_t remote_port; // peer's TCP port
    
    uint32_t snd_nxt;     // next sequence to send
    uint32_t snd_una;     // oldest unacknowledged sequence
    uint32_t rcv_nxt;     // next expected from peer
    
    uint16_t rcv_wnd;     // receive window advertised in outgoing segments
    
    TcpState state;       // current position in the TCP state machine
    
    uint8_t *rcv_buf;     // receive buffer (heap-allocated by create_connection)
    size_t   rcv_buf_len; // presumably the capacity of rcv_buf in bytes — TODO confirm
    size_t   rcv_buf_pos; // data ready in buffer
} TcpConnection;
 
TcpConnection* create_connection(uint32_t local_ip, uint16_t local_port) {
    TcpConnection *conn = calloc(1, sizeof(TcpConnection));
    conn->local_ip = local_ip;
    conn->local_port = local_port;
    conn->state = TCP_CLOSED;
    conn->snd_una = 0;
    conn->snd_nxt = 1;  // Initial seq
    conn->rcv_nxt = 0;
    conn->rcv_wnd = 65535;
    conn->rcv_buf = malloc(65536);
    return conn;
}

Sending TCP Segments

// Build one IPv4 + TCP segment for `conn` and transmit it on a raw socket.
//
//   fd       - AF_INET raw socket with IP_HDRINCL enabled (as created in
//              main()). Such a socket takes packets starting at the IP
//              header, so no Ethernet header is prepended.
//   conn     - connection supplying addresses, ports, seq/ack numbers and
//              the advertised window; ports and addresses are written to the
//              headers exactly as stored in conn
//   flags    - TCP flag bits (TCP_SYN, TCP_ACK, ...)
//   data     - payload bytes, or NULL for none
//   data_len - payload length in bytes (ignored when data is NULL)
//
// Returns the number of bytes handed to the socket, or -1 on failure
// (allocation failure, payload too large for one IPv4 datagram, or a
// sendto() error). On success conn->snd_nxt is advanced by one for each of
// SYN and FIN, which both consume a sequence number. Payload bytes are NOT
// added to snd_nxt here; the caller accounts for them. Nothing is advanced
// when the send fails.
int send_tcp_segment(int fd, TcpConnection *conn, 
                    uint8_t flags, uint8_t *data, size_t data_len) {
    const size_t ip_hdr_len = 20;
    const size_t tcp_hdr_len = 20;
    const size_t max_datagram = 65535;  // IPv4 total length is a 16-bit field

    if (data == NULL) {
        data_len = 0;
    }
    if (data_len > max_datagram - ip_hdr_len - tcp_hdr_len) {
        return -1;
    }

    size_t pkt_len = ip_hdr_len + tcp_hdr_len + data_len;
    uint8_t *pkt = calloc(1, pkt_len);  // zeroed: unset header fields stay 0
    if (pkt == NULL) {
        return -1;
    }

    IpHdr  *ip  = (IpHdr*)pkt;
    TcpHdr *tcp = (TcpHdr*)(pkt + ip_hdr_len);
    uint8_t *payload = pkt + ip_hdr_len + tcp_hdr_len;

    // TCP header
    tcp->sport = conn->local_port;
    tcp->dport = conn->remote_port;
    tcp->seq = htonl(conn->snd_nxt);
    tcp->ack = htonl(conn->rcv_nxt);
    tcp->off = 5;  // 5 words (20 bytes), no options
    tcp->flags = flags;
    tcp->win = htons(conn->rcv_wnd);
    tcp->csum = 0;  // must be zero while the checksum is computed
    tcp->uptr = 0;

    if (data_len > 0) {
        memcpy(payload, data, data_len);
    }

    // IP header
    ip->version = 4;
    ip->ihl = 5;
    ip->len = htons((uint16_t)pkt_len);
    ip->ttl = 64;
    ip->proto = 6;  // TCP
    ip->src = conn->local_ip;
    ip->dst = conn->remote_ip;

    // Checksums (the TCP one needs the IP addresses filled in first)
    tcp->csum = htons(tcp_checksum(ip, tcp, payload, (int)data_len));
    ip->csum = ip_checksum(ip);

    // A raw socket is not connected, so the destination has to be passed
    // explicitly. Assumes conn->remote_ip is in network byte order, which is
    // how main() stores it.
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_addr.s_addr = conn->remote_ip
    };
    ssize_t sent = sendto(fd, pkt, pkt_len, 0,
                          (struct sockaddr*)&dst, sizeof(dst));
    free(pkt);
    if (sent < 0) {
        return -1;
    }

    // SYN and FIN each occupy one sequence number. snd_una is deliberately
    // left alone: it only moves when the peer acknowledges data, never
    // because we sent an ACK ourselves.
    if (flags & TCP_SYN) conn->snd_nxt++;
    if (flags & TCP_FIN) conn->snd_nxt++;

    return (int)sent;
}

TCP Echo Server Main Loop

int main() {
    int sock = socket(AF_INET, SOCK_RAW, IPPROTO_TCP);
    if (sock < 0) { perror("socket"); return 1; }
    
    // Enable header include
    int one = 1;
    setsockopt(sock, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
    
    // Bind to port 7 (echo)
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port = htons(7),
        .sin_addr.s_addr = INADDR_ANY
    };
    bind(sock, (struct sockaddr*)&addr, sizeof(addr));
    
    // Connection table (in production: hash table)
    TcpConnection *conns[256] = {0};
    
    uint8_t recv_buf[65536];
    
    while (1) {
        ssize_t len = recv(sock, recv_buf, sizeof(recv_buf), 0);
        if (len < 0) continue;
        
        EthHdr *eth = (EthHdr*)recv_buf;
        IpHdr  *ip  = (IpHdr*)(recv_buf + 14);
        
        if (ip->proto != 6) continue;  // Not TCP
        
        TcpHdr *tcp = (TcpHdr*)(recv_buf + 14 + ip->ihl * 4);
        uint8_t *payload = (uint8_t*)tcp + tcp->off * 4;
        int payload_len = len - 14 - ip->ihl * 4 - tcp->off * 4;
        
        uint16_t sport = tcp->sport;
        uint16_t dport = tcp->dport;
        uint32_t seq = ntohl(tcp->seq);
        uint32_t ack = ntohl(tcp->ack);
        uint8_t flags = tcp->flags;
        
        // Find or create connection
        TcpConnection *conn = find_connection(ip->dst, dport, ip->src, sport, conns);
        
        if (!conn && (flags & TCP_SYN) && dport == 7) {
            // New connection to echo server
            conn = create_connection(ip->dst, dport);
            conn->remote_ip = ip->src;
            conn->remote_port = sport;
            conn->rcv_nxt = seq + 1;
            conn->state = TCP_SYN_RECEIVED;
            add_connection(conn, conns);
            
            // Send SYN-ACK
            send_tcp_segment(sock, conn, TCP_SYN | TCP_ACK, NULL, 0);
        }
        else if (conn) {
            switch (conn->state) {
                case TCP_SYN_RECEIVED:
                    if (flags & TCP_ACK) {
                        conn->state = TCP_ESTABLISHED;
                    }
                    break;
                    
                case TCP_ESTABLISHED:
                    if (payload_len > 0) {
                        // Echo data back
                        memcpy(conn->rcv_buf, payload, payload_len);
                        conn->snd_nxt += payload_len;
                        send_tcp_segment(sock, conn, TCP_ACK, payload, payload_len);
                    }
                    if (flags & TCP_FIN) {
                        conn->snd_nxt++;
                        send_tcp_segment(sock, conn, TCP_FIN | TCP_ACK, NULL, 0);
                        conn->state = TCP_CLOSE_WAIT;
                    }
                    break;
            }
        }
    }
    
    return 0;
}

Key Concepts This Demonstrates

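  1. Raw sockets let you build and parse TCP segments yourself instead of relying on the kernel’s TCP sockets (the kernel’s own TCP stack still sees the same packets, so it may answer with RSTs unless you firewall the port)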
  2. Sequence numbers must be tracked precisely
  3. Checksum calculation includes pseudo-header
  4. State machine governs all packet handling
  5. Retransmission requires timers (not shown — would need timer management)

What’s Missing (Production TCP)

This is a minimal implementation. Real TCP adds:

  • Retransmission timer — resend if ACK not received
  • Sliding window — flow control beyond fixed window
  • Congestion control — slow start, cwnd
  • Options — MSS, window scaling, timestamps, SACK
  • Delayed ACKs — wait before acknowledging
  • Fast retransmit — 3 duplicate ACKs = retransmit
  • Path MTU discovery — avoid fragmentation

Exercises

  1. Add retransmission: Implement a timer that resends unacknowledged data after timeout
  2. Handle sliding window: Implement proper flow control where receiver advertises available buffer
  3. Add RST handling: Respond to reset segments appropriately
  4. Implement half-close: After echo, close write side but keep reading

See Also