Learning VPP: Flow table

logo_fdio-300x184

Overview

The task is to implement stateful machinery to track TCP connections. To achieve this goal, the following functional pieces are required.

  • A bidirectional hash table to match packets against a flow using 5-tuple;
  • A pool with flow entries data structures;
  • A timer wheel to expire stale flows;
  • TCP state machine tracking to clean up closed TCP connections.

Example

One possible implementation could be found here.

1. A bidirectional hash table.

The requirement is to match the packets from one flow to the same flow entry regardless of their direction. Thus by comparing source and destination addresses, it is we can always calculate the same hash as in the snippet below.

if (ip4_address_compare(&ip4->src_address, &ip4->dst_address) < 0)   {          ip4_sig->src = ip4->src_address;
    ip4_sig->dst = ip4->dst_address;
    *is_reverse = 1;
  } else {
    ip4_sig->src = ip4->dst_address;
    ip4_sig->dst = ip4->src_address;
  }

2. A flow table cache.

To speed up flow entry allocation, flow table cache is used. The idea is to batch entries allocation. In other words, to allocate 256 flow entries at once and store their pool indices inside a vector. As soon as a flow entry allocation is required, leverage preallocated data structure.

always_inline void
flow_entry_cache_fill(flowtable_main_t * fm, flowtable_main_per_cpu_t * fmt)
{
    int i;
    flow_entry_t * f;

    if (pthread_spin_lock(&fm->flows_lock) == 0)
    {
        if (PREDICT_FALSE(fm->flows_cpt > fm->flows_max)) {
            pthread_spin_unlock(&fm->flows_lock);
            return;
        }

        for (i = 0; i < FLOW_CACHE_SZ; i++) {                          pool_get_aligned(fm->flows, f, CLIB_CACHE_LINE_BYTES);
            vec_add1(fmt->flow_cache, f - fm->flows);
        }
        fm->flows_cpt += FLOW_CACHE_SZ;

        pthread_spin_unlock(&fm->flows_lock);
    }
}

3. The timer wheel.

To avoid stale flow entries, timers, organized in a so-called timer wheel are used.

static u64
flowtable_timer_expire(flowtable_main_t * fm, flowtable_main_per_cpu_t * fmt,
    u32 now)
{
    u64 expire_cpt;
    flow_entry_t * f;
    u32 * time_slot_curr_index;
    dlist_elt_t * time_slot_curr;
    u32 index;

    time_slot_curr_index = vec_elt_at_index(fmt->timer_wheel, fmt->time_index);

    if (PREDICT_FALSE(dlist_is_empty(fmt->timers, *time_slot_curr_index)))
        return 0;

    expire_cpt = 0;
    time_slot_curr = pool_elt_at_index(fmt->timers, *time_slot_curr_index);

    index = time_slot_curr->next;
    while (index != *time_slot_curr_index && expire_cpt < TIMER_MAX_EXPIRE)     {         dlist_elt_t * e = pool_elt_at_index(fmt->timers, index);
        f = pool_elt_at_index(fm->flows, e->value);

        index = e->next;
        expire_single_flow(fm, fmt, f, e);
        expire_cpt++;
    }

    return expire_cpt;
}

4. Flow entries recycling.

As soon as there are no more available flow entries in the pool, there is a mechanism to recycle the oldest entries.

static void
recycle_flow(flowtable_main_t * fm, flowtable_main_per_cpu_t * fmt, u32 now)
{
    u32 next;

    next = (now + 1) % TIMER_MAX_LIFETIME;
    while (PREDICT_FALSE(next != now))
    {
        flow_entry_t * f;
        u32 * slot_index = vec_elt_at_index(fmt->timer_wheel, next);

        if (PREDICT_FALSE(dlist_is_empty(fmt->timers, *slot_index))) {
            next = (next + 1) % TIMER_MAX_LIFETIME;
            continue;
        }
        dlist_elt_t * head = pool_elt_at_index(fmt->timers, *slot_index);
        dlist_elt_t * e = pool_elt_at_index(fmt->timers, head->next);

        f = pool_elt_at_index(fm->flows, e->value);
        return expire_single_flow(fm, fmt, f, e);
    }

    /*
     * unreachable:
     * this should be called if there is no free flows, so we're bound to have
     * at least *one* flow within the timer wheel (cpu cache is filled at init).
     */
    clib_error("recycle_flow did not find any flow to recycle !");
}

5. Bucket list.

The bidirectional hash algorithm that is used to lookup 5-tuple against a flow entry can result in multiple signatures for different tuples. To overcome this limitation, the list of entries is attached to each hash bucket.

clib_dlist_addhead(fmt->ht_lines, ht_line_head_index, f->ht_index);

6. TCP state machine.

TCP connection states are tracked in order to control flow lifetime.

static const tcp_state_t tcp_trans[TCP_STATE_MAX][TCP_EV_MAX] =
{
    [TCP_STATE_START] = {
        [TCP_EV_SYN]    = TCP_STATE_SYN,
        [TCP_EV_SYNACK] = TCP_STATE_SYNACK,
        [TCP_EV_FIN]    = TCP_STATE_FIN,
        [TCP_EV_FINACK] = TCP_STATE_FINACK,
        [TCP_EV_RST]    = TCP_STATE_RST,
        [TCP_EV_NONE]   = TCP_STATE_ESTABLISHED,
    },
    [TCP_STATE_SYN] = {
        [TCP_EV_SYNACK] = TCP_STATE_SYNACK,
        [TCP_EV_PSHACK] = TCP_STATE_ESTABLISHED,
        [TCP_EV_FIN]    = TCP_STATE_FIN,
        [TCP_EV_FINACK] = TCP_STATE_FINACK,
        [TCP_EV_RST]    = TCP_STATE_RST,
    },
    [TCP_STATE_SYNACK] = {
        [TCP_EV_PSHACK] = TCP_STATE_ESTABLISHED,
        [TCP_EV_FIN]    = TCP_STATE_FIN,
        [TCP_EV_FINACK] = TCP_STATE_FINACK,
        [TCP_EV_RST]    = TCP_STATE_RST,
    },
    [TCP_STATE_ESTABLISHED] = {
        [TCP_EV_FIN]    = TCP_STATE_FIN,
        [TCP_EV_FINACK] = TCP_STATE_FINACK,
        [TCP_EV_RST]    = TCP_STATE_RST,
    },
    [TCP_STATE_FIN] = {
        [TCP_EV_FINACK] = TCP_STATE_FINACK,
        [TCP_EV_RST]    = TCP_STATE_RST,
    },
    [TCP_STATE_FINACK] = {
        [TCP_EV_RST]    = TCP_STATE_RST,
    },
};

7. TCP lifetime.

TCP flow entry has a different expiration time depending on the connection life stage.

static const int tcp_lifetime[TCP_STATE_MAX] =
{
    [TCP_STATE_START]       = 60,
    [TCP_STATE_SYN]         = 15,
    [TCP_STATE_SYNACK]      = 60,
    [TCP_STATE_ESTABLISHED] = 299,
    [TCP_STATE_FIN]         = 15,
    [TCP_STATE_FINACK]      = 3,
    [TCP_STATE_RST]         = 6
};

References

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s