TCP/IP Stack from Scratch
Goal: Build a minimal TCP echo server using raw sockets. Understand how TCP actually works under the hood — the state machine, segment format, retransmission, and flow control. Instead of relying on the kernel's TCP behind ordinary Berkeley stream sockets, you implement the protocol yourself on top of a raw socket.
Prerequisites: C sockets basics, understanding of IP/TCP headers, TCP state machine
What Berkeley Sockets Hide
When you call connect(), send(), recv() — the kernel implements TCP. You never see:
- SYN / SYN-ACK / ACK handshake
- Sequence number management
- Retransmission timers
- Flow control (sliding window)
- Congestion control
This tutorial builds a TCP echo server that handles packets directly.
TCP State Machine
Before code, understand the states:
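Active open and active close (the side that connects and closes first):
CLOSED → SYN_SENT → ESTABLISHED → FIN_WAIT_1 → FIN_WAIT_2 → TIME_WAIT → CLOSED
Passive open and passive close (a server such as the echo server below):
CLOSED → LISTEN → SYN_RECEIVED → ESTABLISHED → CLOSE_WAIT → LAST_ACK → CLOSED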
Key transitions:
- LISTEN → SYN_RECEIVED: SYN received, SYN+ACK sent
- SYN_RECEIVED → ESTABLISHED: ACK received
- ESTABLISHED → FIN_WAIT_1: First FIN sent
- FIN_WAIT_1 → FIN_WAIT_2: ACK of our FIN received
- FIN_WAIT_2 → TIME_WAIT: Their FIN received, ACK sent
- TIME_WAIT → CLOSED: 2MSL timeout
IP and TCP Header Structures
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
// Ethernet frame header: destination MAC, source MAC, then a 16-bit type
// field. `packed` rules out padding, so the struct is exactly 14 bytes.
#define ETH_TYPE_IP 0x0800 // value of `type` that marks an IPv4 payload
typedef struct __attribute__((packed)) {
    uint8_t  dmac[6]; // destination MAC address
    uint8_t  smac[6]; // source MAC address
    uint16_t type;    // payload type, e.g. ETH_TYPE_IP
                      // NOTE(review): presumably big-endian on the wire — confirm at use sites
} EthHdr;
// IPv4 header (20 bytes when there are no options; `ihl` gives the real
// header length in 32-bit words). The struct is overlaid directly on packet
// bytes, so multi-byte fields hold whatever byte order the packet uses.
typedef struct __attribute__((packed)) {
    // The first header byte is version (high nibble) followed by IHL (low
    // nibble). C leaves bit-field allocation order to the implementation, so
    // the declaration order has to follow the target's byte order.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    uint8_t version:4, // version (4)
            ihl:4;     // header length (words)
#else
    uint8_t ihl:4,     // header length (words)
            version:4; // version (4)
#endif
    uint8_t tos;       // type of service
    uint16_t len;      // total length (header + payload)
    uint16_t id;       // identification
    // NOTE(review): on little-endian targets this 3/13 split does not line up
    // with the wire layout, because the 16-bit field is byte-swapped in
    // memory. It is harmless while the field stays zero; code that needs
    // fragmentation info should treat these two bytes as one big-endian
    // 16-bit value. The field names are kept for compatibility.
    uint16_t flags:3,  // fragmentation flags
             frag:13;  // fragment offset
    uint8_t ttl;       // time to live
    uint8_t proto;     // protocol (6 = TCP)
    uint16_t csum;     // header checksum
    uint32_t src;      // source IP
    uint32_t dst;      // dest IP
} IpHdr;
// TCP header (20 bytes when there are no options; `off` gives the real
// header length in 32-bit words). The struct is overlaid directly on packet
// bytes, so multi-byte fields hold whatever byte order the packet uses.
typedef struct __attribute__((packed)) {
    uint16_t sport;    // source port
    uint16_t dport;    // dest port
    uint32_t seq;      // sequence number
    uint32_t ack;      // acknowledgment number
    // Byte 12 of the header carries the data offset in its HIGH nibble and
    // reserved bits in the low nibble. Little-endian compilers allocate the
    // first-declared bit-field in the low bits, so the reserved bits must be
    // declared first there; otherwise `off = 5` would put 0x05 on the wire
    // instead of 0x50.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    uint8_t off:4,     // data offset (words)
            :4;        // reserved
#else
    uint8_t :4,        // reserved
            off:4;     // data offset (words)
#endif
    uint8_t flags;     // flags (TCP_FIN, TCP_SYN, ...)
    uint16_t win;      // window size
    uint16_t csum;     // checksum
    uint16_t uptr;     // urgent pointer
} TcpHdr;
// TCP control bits, used as masks over TcpHdr.flags
#define TCP_FIN 0x01 // sender has no more data to send
#define TCP_SYN 0x02 // synchronize sequence numbers
#define TCP_RST 0x04 // reset the connection
#define TCP_PSH 0x08 // push buffered data to the application
#define TCP_ACK 0x10 // acknowledgment number is valid
#define TCP_URG 0x20 // urgent pointer is valid

/* TCP Segment Checksum */
TCP checksum covers pseudo-header + TCP segment:
/*
 * Add `len` bytes to a one's-complement accumulator as 16-bit words, reading
 * them in memory order. memcpy avoids misaligned or aliasing-unsafe loads.
 * An odd trailing byte is paired with a zero byte placed AFTER it in memory,
 * which gives the right word on both little- and big-endian hosts.
 */
static uint32_t csum_add_bytes(uint32_t sum, const uint8_t *data, size_t len) {
    uint16_t word;
    size_t i = 0;

    for (; i + 1 < len; i += 2) {
        memcpy(&word, data + i, sizeof word);
        sum += word;
    }
    if (i < len) {
        word = 0;
        memcpy(&word, data + i, 1);
        sum += word;
    }
    return sum;
}

/*
 * Compute the TCP checksum over the pseudo-header (source IP, destination IP,
 * zero byte, protocol, TCP length), the TCP header, and the payload.
 *
 * ip          IP header of the segment; src, dst, proto, len and ihl are read.
 * tcp         TCP header; `off` gives its length. Whatever is stored in
 *             tcp->csum is included in the sum, so zero it before computing
 *             a checksum for transmission.
 * payload     segment data; may be NULL when payload_len is 0.
 * payload_len number of payload bytes; values <= 0 are treated as no payload.
 *
 * Returns the complemented sum. Because the words are added in memory order,
 * the result is already laid out for a plain store into tcp->csum; do not
 * byte-swap it.
 */
uint16_t tcp_checksum(IpHdr *ip, TcpHdr *tcp, uint8_t *payload, int payload_len) {
    uint8_t pseudo[12];

    // Pseudo-header: src IP, dst IP, zero, protocol, TCP length (big-endian)
    memcpy(pseudo, &ip->src, 4);
    memcpy(pseudo + 4, &ip->dst, 4);
    pseudo[8] = 0;
    pseudo[9] = ip->proto;
    uint16_t tcp_len = (uint16_t)(ntohs(ip->len) - ip->ihl * 4);
    pseudo[10] = (uint8_t)(tcp_len >> 8);
    pseudo[11] = (uint8_t)(tcp_len & 0xFF);

    uint32_t sum = 0;
    sum = csum_add_bytes(sum, pseudo, sizeof pseudo);
    // The header length is a multiple of 4 bytes, so the payload always
    // starts on a 16-bit boundary and can be summed separately.
    sum = csum_add_bytes(sum, (const uint8_t *)tcp, (size_t)tcp->off * 4);
    if (payload && payload_len > 0) {
        sum = csum_add_bytes(sum, payload, (size_t)payload_len);
    }

    // Fold the carries back in until the sum fits in 16 bits
    while (sum >> 16) {
        sum = (sum & 0xFFFF) + (sum >> 16);
    }
    return (uint16_t)~sum;
}

/* Connection State */
#include <stdatomic.h>
// Connection states tracked per TcpConnection. The numeric values follow
// declaration order, starting at 0.
typedef enum {
    TCP_CLOSED       = 0, // no connection
    TCP_LISTEN       = 1, // waiting for an incoming SYN
    TCP_SYN_RECEIVED = 2, // SYN seen, SYN+ACK sent, waiting for the ACK
    TCP_ESTABLISHED  = 3, // handshake complete, data may flow
    TCP_FIN_WAIT_1   = 4, // our FIN sent, waiting for its ACK
    TCP_FIN_WAIT_2   = 5, // our FIN acknowledged, waiting for the peer's FIN
    TCP_CLOSE_WAIT   = 6, // peer's FIN received, we have not closed yet
    TCP_LAST_ACK     = 7, // our FIN sent after the peer's, waiting for its ACK
    TCP_TIME_WAIT    = 8  // both sides closed, waiting out 2MSL
} TcpState;
// Per-connection control block: the address 4-tuple, the sequence-number
// bookkeeping for both directions, the current state, and a receive buffer.
typedef struct {
    // Endpoint identity. main() fills these straight from received headers,
    // so addresses and ports are kept in network byte order.
    uint32_t local_ip;
    uint16_t local_port;
    uint32_t remote_ip;
    uint16_t remote_port;

    // Sequence space (host byte order)
    uint32_t snd_nxt;    // next sequence to send
    uint32_t snd_una;    // oldest unacknowledged sequence
    uint32_t rcv_nxt;    // next expected from peer
    uint16_t rcv_wnd;    // receive window advertised in outgoing segments

    TcpState state;      // current position in the TCP state machine

    // Receive side storage
    uint8_t *rcv_buf;    // receive buffer
    size_t rcv_buf_len;  // NOTE(review): presumably the capacity of rcv_buf — confirm
    size_t rcv_buf_pos;  // data ready in buffer
} TcpConnection;
TcpConnection* create_connection(uint32_t local_ip, uint16_t local_port) {
TcpConnection *conn = calloc(1, sizeof(TcpConnection));
conn->local_ip = local_ip;
conn->local_port = local_port;
conn->state = TCP_CLOSED;
conn->snd_una = 0;
conn->snd_nxt = 1; // Initial seq
conn->rcv_nxt = 0;
conn->rcv_wnd = 65535;
conn->rcv_buf = malloc(65536);
return conn;
}Sending TCP Segments
/*
 * Build one IPv4 + TCP segment for `conn` and transmit it on `fd`.
 *
 * fd        raw socket created as AF_INET / SOCK_RAW with IP_HDRINCL enabled
 *           (as main() does). Such a socket takes datagrams that begin at the
 *           IP header, so no Ethernet header is prepended.
 * conn      supplies addresses, ports, snd_nxt, rcv_nxt and rcv_wnd. Addresses
 *           and ports are copied verbatim and must already be in network byte
 *           order.
 * flags     TCP control bits (TCP_SYN, TCP_ACK, ...).
 * data      payload bytes, or NULL for none. If data is NULL while data_len is
 *           non-zero, data_len zero bytes are sent.
 * data_len  payload length in bytes.
 *
 * Returns the number of bytes handed to the socket, or -1 if the segment
 * would not fit in an IP datagram, memory could not be allocated, or the
 * send failed.
 *
 * On success conn->snd_nxt is advanced by one for SYN and by one for FIN.
 * It is NOT advanced for payload bytes; the caller accounts for those.
 */
int send_tcp_segment(int fd, TcpConnection *conn,
                     uint8_t flags, uint8_t *data, size_t data_len) {
    enum { IP_HDR_LEN = 20, TCP_HDR_LEN = 20, IP_MAX_LEN = 65535 };

    // The IP total-length field is 16 bits wide; refuse anything larger
    if (data_len > (size_t)(IP_MAX_LEN - IP_HDR_LEN - TCP_HDR_LEN)) {
        return -1;
    }

    size_t pkt_len = IP_HDR_LEN + TCP_HDR_LEN + data_len;
    uint8_t *pkt = calloc(1, pkt_len); // zeroed: unset header fields stay 0
    if (!pkt) {
        return -1;
    }

    IpHdr *ip = (IpHdr*)pkt;
    TcpHdr *tcp = (TcpHdr*)(pkt + IP_HDR_LEN);
    uint8_t *payload = pkt + IP_HDR_LEN + TCP_HDR_LEN;

    // TCP header
    tcp->sport = conn->local_port;
    tcp->dport = conn->remote_port;
    tcp->seq = htonl(conn->snd_nxt);
    tcp->ack = htonl(conn->rcv_nxt);
    tcp->off = TCP_HDR_LEN / 4; // 5 words (20 bytes), no options
    tcp->flags = flags;
    tcp->win = htons(conn->rcv_wnd);
    tcp->csum = 0; // must be zero while the checksum is computed
    tcp->uptr = 0;

    // Copy data
    if (data && data_len > 0) {
        memcpy(payload, data, data_len);
    }

    // IP header (filled before the TCP checksum, which reads several fields)
    ip->version = 4;
    ip->ihl = IP_HDR_LEN / 4;
    ip->len = htons((uint16_t)pkt_len);
    ip->ttl = 64;
    ip->proto = 6; // TCP
    ip->src = conn->local_ip;
    ip->dst = conn->remote_ip;

    // Checksums. tcp_checksum() adds the header bytes as they sit in memory,
    // so its result is stored as-is; byte-swapping it would corrupt it on
    // little-endian hosts.
    tcp->csum = tcp_checksum(ip, tcp, payload, (int)data_len);
    ip->csum = ip_checksum(ip);

    // A raw socket is not connected, so the destination is passed explicitly
    struct sockaddr_in dst;
    memset(&dst, 0, sizeof dst);
    dst.sin_family = AF_INET;
    dst.sin_addr.s_addr = conn->remote_ip;

    ssize_t sent = sendto(fd, pkt, pkt_len, 0,
                          (struct sockaddr*)&dst, sizeof dst);
    free(pkt);
    if (sent < 0) {
        return -1;
    }

    // SYN and FIN each occupy one sequence number. snd_una is deliberately
    // left alone: it moves when the peer acknowledges our data, not when we
    // send an ACK of theirs.
    if (flags & TCP_SYN) conn->snd_nxt++;
    if (flags & TCP_FIN) conn->snd_nxt++;
    return (int)sent;
}

/* TCP Echo Server Main Loop */
int main() {
int sock = socket(AF_INET, SOCK_RAW, IPPROTO_TCP);
if (sock < 0) { perror("socket"); return 1; }
// Enable header include
int one = 1;
setsockopt(sock, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
// Bind to port 7 (echo)
struct sockaddr_in addr = {
.sin_family = AF_INET,
.sin_port = htons(7),
.sin_addr.s_addr = INADDR_ANY
};
bind(sock, (struct sockaddr*)&addr, sizeof(addr));
// Connection table (in production: hash table)
TcpConnection *conns[256] = {0};
uint8_t recv_buf[65536];
while (1) {
ssize_t len = recv(sock, recv_buf, sizeof(recv_buf), 0);
if (len < 0) continue;
EthHdr *eth = (EthHdr*)recv_buf;
IpHdr *ip = (IpHdr*)(recv_buf + 14);
if (ip->proto != 6) continue; // Not TCP
TcpHdr *tcp = (TcpHdr*)(recv_buf + 14 + ip->ihl * 4);
uint8_t *payload = (uint8_t*)tcp + tcp->off * 4;
int payload_len = len - 14 - ip->ihl * 4 - tcp->off * 4;
uint16_t sport = tcp->sport;
uint16_t dport = tcp->dport;
uint32_t seq = ntohl(tcp->seq);
uint32_t ack = ntohl(tcp->ack);
uint8_t flags = tcp->flags;
// Find or create connection
TcpConnection *conn = find_connection(ip->dst, dport, ip->src, sport, conns);
if (!conn && (flags & TCP_SYN) && dport == 7) {
// New connection to echo server
conn = create_connection(ip->dst, dport);
conn->remote_ip = ip->src;
conn->remote_port = sport;
conn->rcv_nxt = seq + 1;
conn->state = TCP_SYN_RECEIVED;
add_connection(conn, conns);
// Send SYN-ACK
send_tcp_segment(sock, conn, TCP_SYN | TCP_ACK, NULL, 0);
}
else if (conn) {
switch (conn->state) {
case TCP_SYN_RECEIVED:
if (flags & TCP_ACK) {
conn->state = TCP_ESTABLISHED;
}
break;
case TCP_ESTABLISHED:
if (payload_len > 0) {
// Echo data back
memcpy(conn->rcv_buf, payload, payload_len);
conn->snd_nxt += payload_len;
send_tcp_segment(sock, conn, TCP_ACK, payload, payload_len);
}
if (flags & TCP_FIN) {
conn->snd_nxt++;
send_tcp_segment(sock, conn, TCP_FIN | TCP_ACK, NULL, 0);
conn->state = TCP_CLOSE_WAIT;
}
break;
}
}
}
return 0;
}Key Concepts This Demonstrates
- Raw sockets bypass the kernel’s TCP implementation
- Sequence numbers must be tracked precisely
- Checksum calculation includes pseudo-header
- State machine governs all packet handling
- Retransmission requires timers (not shown — would need timer management)
What’s Missing (Production TCP)
This is a minimal implementation. Real TCP adds:
- Retransmission timer — resend if ACK not received
- Sliding window — flow control beyond fixed window
- Congestion control — slow start, cwnd
- Options — MSS, window scaling, timestamps, SACK
- Delayed ACKs — wait before acknowledging
- Fast retransmit — 3 duplicate ACKs = retransmit
- Path MTU discovery — avoid fragmentation
Exercises
- Add retransmission: Implement a timer that resends unacknowledged data after timeout
- Handle sliding window: Implement proper flow control where receiver advertises available buffer
- Add RST handling: Respond to reset segments appropriately
- Implement half-close: After echo, close write side but keep reading
See Also
- TCP Protocol — deeper TCP concepts
- Socket Programming — contrast with Berkeley sockets approach
- IP and Routing — IP layer concepts
- RFC 793 — TCP specification
- cperrad/nicer_stack — educational TCP implementation