Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/05.HTTP_SERVER/http_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ void __cheri_compartment("http_server_example") example()
if (retries == 0)
{
Debug::log("Failed to close the listening socket.");

// We have to stop now - if we cannot close the
// listening socket, we will not be able to bind onto
// the server port anymore. This is bad!
break;
}
}

Expand Down
52 changes: 21 additions & 31 deletions lib/tcpip/FreeRTOS_IP_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,8 @@ void ip_thread_start(void);
/**
* Flag used to synchronize the network stack thread and user threads at
* startup.
*
* This should not be reset by the error handler and is reset-critical.
*
* Note (which also applies to other reset-critical data): ultimately we should
* move this to a separate "network stack TCB" compartment to be able to reset
* all the state of this compartment unconditionally by re-zeroing the BSS and
* resetting .data from snapshots.
*/
static uint32_t threadEntryGuard;

/**
* Store the thread ID of the TCP/IP thread for use in the error handler.
*/
uint16_t networkThreadID;
uint32_t threadEntryGuard;

/**
* Global lock acquired by the IP thread at startup time. In normal running
Expand Down Expand Up @@ -112,8 +100,6 @@ void __cheri_compartment("TCPIP") ip_thread_entry(void)
{
FreeRTOS_printf(("ip_thread_entry\n"));

networkThreadID = thread_id_get();

while (1)
{
CHERIOT_DURING
Expand All @@ -125,22 +111,7 @@ void __cheri_compartment("TCPIP") ip_thread_entry(void)
futex_wait(&threadEntryGuard, 0);
}

// Reset the guard now: we will only ever re-iterate
// the loop in the case of a network stack reset, in
// which case we want to wait again for a call to
// `ip_thread_start`.
//
// Note that we cannot reset this in `ip_cleanup`,
// because that would overwrite the 1 written by
// `ip_thread_start` if the latter was called before we
// call `ip_cleanup`. This will happen if the IP thread
// crashes, in which case we would first call
// `ip_thread_start` in the error handler, and then
// reset the context to `ip_thread_entry` (itself then
// calling `ip_cleanup`).
threadEntryGuard = 0;

xIPTaskHandle = networkThreadID;
xIPTaskHandle = thread_id_get();
FreeRTOS_printf(
("ip_thread_entry starting, thread ID is %p\n", xIPTaskHandle));

Expand All @@ -149,6 +120,25 @@ void __cheri_compartment("TCPIP") ip_thread_entry(void)
// FreeRTOS event loop. This will only return if a user
// thread crashed.
prvIPTask(NULL);

// Reset the guard. We only want to do this in the case
// of a user thread crash, and we will only ever reach
// this line if a user thread crashes (otherwise we'd
// end up in the error handler).
//
// This is guaranteed not to race with
// `ip_thread_start` since that function will only get
// called after we release the IP thread lock at the
// bottom of this function.
//
// The reason why we only reset this in the case of a
// user thread crash is that the synchronization is
// trivial in the case of a network thread crash: we
// know that we will only enter the next loop iteration
// after the guard has been set to 1 by
// `ip_thread_start`, since the error handler only
// returns once `network_restart` has returned.
threadEntryGuard = 0;
}
CHERIOT_HANDLER
{
Expand Down
52 changes: 46 additions & 6 deletions lib/tcpip/network_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ extern "C" int ipFOREVER(void)
* previous instance of the network stack.
*
* This should not be reset by the error handler and is reset-critical.
*
* Note, however, that only one line in the code ever writes to this variable:
* the error handler. Assuming that the control-flow is not compromised (threat
* model of the reset), this should be impossible to corrupt.
*/
std::atomic<uint32_t> currentSocketEpoch = 0;

Expand Down Expand Up @@ -131,6 +135,17 @@ namespace
-EAGAIN /* return -EAGAIN if we are restarting */);
}

/**
* Length of a locking step for socket locks. When we attempt to grab a
* socket lock, do so in steps of `SocketLockTimeoutStep`. That way, if
* the network stack crashes and we somehow do not have a pointer to
* the socket lock anymore (and thus cannot reset it), we have a
* guarantee that the blocking thread will become live again within a
* bounded amount of time, and bail out as it finds out, through the
* condition function, that we are currently restarting.
*/
static constexpr const Ticks SocketLockTimeoutStep = MS_TO_TICKS(500);

/**
* Wrapper around `with_sealed_socket` that locks the socket before calling
* the operation. This is used by everything except the close operation
Expand All @@ -143,7 +158,24 @@ namespace
{
return with_sealed_socket(
[&](SealedSocket *socket) {
if (LockGuard g{socket->socketLock, timeout})
if (LockGuard g{
socket->socketLock, timeout, SocketLockTimeoutStep, [&]() {
// Return true when we need to bail
// out, in two cases:
// - we are currently restarting
// - the socket is from a previous
// instance of the network stack
//
// We must check the latter: if the
// socket list was corrupted, we may
// not have been able to set this lock
// for destruction, and it may still be
// left in a "locked" state by a thread
// that previously crashed while
// holding the lock.
return restartState != 0 || (socket->socketEpoch !=
currentSocketEpoch.load());
}})
{
return operation(socket);
}
Expand Down Expand Up @@ -773,14 +805,22 @@ int network_socket_close(Timeout *t, SObj mallocCapability, SObj sealedSocket)
}
return with_sealed_socket<true /* this is a close operation */>(
[=](SealedSocket *socket) {
bool socketIsFromPreviousInstance =
(socket->socketEpoch != currentSocketEpoch.load());
// We will fail to lock if the socket is coming from
// a previous instance of the network stack as it set
// for destruction. Ignore the failure: we will not
// call the FreeRTOS API on it anyways.
LockGuard g{socket->socketLock, t};
if (g || (socket->socketEpoch != currentSocketEpoch.load()))
LockGuard g{socket->socketLock, t, SocketLockTimeoutStep, [&]() {
// Return true when we need to bail out,
// in two cases (see comment in
// `with_sealed_socket`).
return restartState != 0 ||
socketIsFromPreviousInstance;
}};
if (g || socketIsFromPreviousInstance)
{
if (socket->socketEpoch != currentSocketEpoch.load())
if (socketIsFromPreviousInstance)
{
Debug::log(
"Destroying a socket from a previous instance of the "
Expand Down Expand Up @@ -809,7 +849,7 @@ int network_socket_close(Timeout *t, SObj mallocCapability, SObj sealedSocket)
// stack (the socket is invalid anyways). Don't close
// the firewall either as this was already done
// during the reset.
if (socket->socketEpoch == currentSocketEpoch.load())
if (!socketIsFromPreviousInstance)
{
auto rawSocket = socket->socket;

Expand Down Expand Up @@ -885,7 +925,7 @@ int network_socket_close(Timeout *t, SObj mallocCapability, SObj sealedSocket)
}
}
int ret = 0;
if (socket->socketEpoch == currentSocketEpoch.load())
if (!socketIsFromPreviousInstance)
{
Debug::Assert(!ds::linked_list::is_singleton(&(socket->ring)),
"The socket should be present in the list.");
Expand Down
9 changes: 7 additions & 2 deletions lib/tcpip/tcpip-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,6 @@ struct SealedSocket
*
* This is used as part of the network stack reset to clean up sockets and
* unblock threads waiting on message queues.
*
* This will be reset by the error handler, however it *is* reset-critical.
*/
extern ds::linked_list::Sentinel<SocketRingLink> sealedSockets;

Expand All @@ -95,6 +93,13 @@ extern std::atomic<uint8_t> restartState;
* performing a network stack reset.
*
* This should not be reset by the error handler and is reset-critical.
*
* Note however that the only code that ever writes to this variable is 1)
* `with_restarting_checks` and `with_restarting_checks_driver` below, and 2)
* the error handler. `with_restarting_checks` and
* `with_restarting_checks_driver` can only be executed when entering the
* compartment. Assuming the control-flow is not compromised (threat model of
* the reset), this should be impossible to corrupt.
*/
extern std::atomic<uint8_t> userThreadCount;

Expand Down
85 changes: 60 additions & 25 deletions lib/tcpip/tcpip_error_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ extern struct RecursiveMutexState __CriticalSectionFlagLock;
extern struct RecursiveMutexState __SuspendFlagLock;
extern QueueHandle_t xNetworkEventQueue;
extern FlagLockPriorityInherited sealedSocketsListLock;
extern uint32_t threadEntryGuard;

/**
* Restart the network stack. See documentation in `startup.cc`
Expand Down Expand Up @@ -141,6 +142,22 @@ extern "C" void reset_network_stack_state(bool isIpThread)
DebugErrorHandler::log("The network thread crashed while "
"restarting. This may be unrecoverable.");
}
else if (isIpThread)
{
// Reset the IP thread entry guard. We only want to do
// this in the case of a user thread crash, and we will
// only ever reach this line if a user thread crashed
// first (causing the network thread to crash).
//
// This is guaranteed not to race with
// `ip_thread_start` since that function will only get
// called after we return.
//
// For more information, look at the other place where
// we reset `threadEntryGuard` in
// `FreeRTOS_IP_wrapper.c`.
threadEntryGuard = 0;
}

// Another instance of the error handler is running, do not do
// anything.
Expand Down Expand Up @@ -173,45 +190,63 @@ extern "C" void reset_network_stack_state(bool isIpThread)
// ensure that threads waiting on them exit the compartment. We will
// empty the list later.
//
// FIXME This should be made more resilient against corruption of the
// linked list by checking all pointers.
// This is designed to be resilient to a corrupted socket list. If the
// socket list somehow ended up corrupted, we would still manage to
// reset, albeit slower. In the case of socket locks, we would need to
// wait up to 500ms for waiters to wake up, check the network stack
// state machine, and bail out. For event groups, we would need to
// wait 500ms too, until waiters try to re-acquire the lock and crash
// because it was freed through the heap-free-all below.
DebugErrorHandler::log(
"Setting socket locks for destruction and destroying event groups.");
if (!sealedSockets.is_empty())
{
auto *cell = sealedSockets.first();
while (cell != &sealedSockets.sentinel)
{
struct SealedSocket *socket = SealedSocket::from_ring(cell);

FlagLockPriorityInherited *lock = &(socket->socketLock);
if (Capability{lock}.is_valid())
if (!Capability{cell}.is_valid())
{
DebugErrorHandler::log("Destroying socket lock {}.", lock);
lock->upgrade_for_destruction();
}
else
{
DebugErrorHandler::log("Ignoring corrupted socket lock {}.",
lock);
DebugErrorHandler::log(
"Ignoring corrupted cell {} in the socket list.", cell);
// We have to stop here because we cannot
// retrieve the next cell from a corrupted
// cell.
break;
}

FreeRTOS_Socket_t *s = socket->socket;
if (Capability{s}.is_valid() &&
Capability{s->xEventGroup}.is_valid())
struct SealedSocket *socket = SealedSocket::from_ring(cell);

if (Capability{socket}.is_valid())
{
DebugErrorHandler::log("Destroying event group {}.",
s->xEventGroup);
eventgroup_destroy_force(MALLOC_CAPABILITY, s->xEventGroup);
FlagLockPriorityInherited *lock = &(socket->socketLock);
if (Capability{lock}.is_valid())
{
DebugErrorHandler::log("Destroying socket lock {}.", lock);
lock->upgrade_for_destruction();
}
else
{
DebugErrorHandler::log("Ignoring corrupted socket lock {}.",
lock);
}

FreeRTOS_Socket_t *s = socket->socket;
if (Capability{s}.is_valid() &&
Capability{s->xEventGroup}.is_valid())
{
DebugErrorHandler::log("Destroying event group {}.",
s->xEventGroup);
eventgroup_destroy_force(MALLOC_CAPABILITY, s->xEventGroup);
}
else
{
DebugErrorHandler::log("Ignoring corrupted socket {}.", s);
}
}
else
{
// The memory of the event group will still be
// freed later with the `heap_free_all`,
// however we run the risk to have the IP
// thread stuck on the event queue which we
// didn't manage to destroy.
DebugErrorHandler::log("Ignoring corrupted socket {}.", s);
DebugErrorHandler::log(
"Ignoring corrupted socket {} in the socket list.", socket);
}

cell = cell->cell_next();
Expand Down
Loading