Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 44 additions & 27 deletions functions/replace-route/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,34 @@
import time
import urllib
import socket
import structlog
import orjson

import botocore
import boto3


slogger = structlog.get_logger()
# use structlog's production-ready, performant example config
# ref: https://www.structlog.org/en/stable/performance.html#example
structlog.configure(
cache_logger_on_first_use=True,
wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
processors=[
structlog.contextvars.merge_contextvars,
structlog.processors.add_log_level,
structlog.processors.format_exc_info,
structlog.processors.TimeStamper(fmt="iso", utc=True),
structlog.processors.EventRenamer("message"),
structlog.processors.JSONRenderer(serializer=orjson.dumps)
],
logger_factory=structlog.BytesLoggerFactory()
)

# logger is still needed to set the level for dependencies
logger = logging.getLogger()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this line still needed?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The snippet is selecting 4 lines, I gather this may be for the comment.

This relates to this comment regarding third party dependencies. structlog doesn't appear to completely drop-in to replace logger in this specific area. If it's not valuable, I am happy to drop it. My intention here was to describe why logger is still being used while slogger is in place.

I don't believe any of my commits, or this PR actually describes why they're staying, so a git log wouldn't help anyone in that area :(. I don't know if the commit will be a squash or not, but if it's a merge commit, then I can amend the commit message where the surrounding change happened if it helps?

Copy link
Collaborator

@bwhaley bwhaley Feb 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I'm confused about is that logger = logging.getLogger() is still set here but is never used throughout the file. The levels are set using logging.getLogger below. But is the logger object still used in some way? I may be misunderstanding.

logger.setLevel(logging.INFO)
logging.getLogger('boto3').setLevel(logging.CRITICAL)
logging.getLogger('botocore').setLevel(logging.CRITICAL)
Comment on lines 31 to 33
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this guide with regards to working with the standard library, but I wasn't sure how I could make this applicable here. Otherwise, third party deps will continue using the standard logger unless someone can figure out if wrapping the logger is possible.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My colleague suggests that we could use https://nhairs.github.io/python-json-logger/latest/ as the formatter for these handlers. e.g. something like:

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logging.getLogger('boto3').addHandler(handler)
logging.getLogger('botocore').addHandler(handler)

Do you think this would work?

Copy link
Author

@therealvio therealvio Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so! This would align with that guide, the section specifically being: "Rendering using logging-based formatters".

I will give it a go and push out a separate commit for this. May take me a bit because it's going to involve tweaking the configs based on what I have now, vs what the guide I am looking at is suggesting.

Copy link
Author

@therealvio therealvio Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An FYI: In the change, handler is a loaded term, since that's the entry point of the function. For now, I have opted for loghandler and we can come up with a better nameas we iterate and review.



ec2_client = boto3.client("ec2")

LIFECYCLE_KEY = "LifecycleHookName"
Expand Down Expand Up @@ -51,18 +68,18 @@ def get_az_and_vpc_zone_identifier(auto_scaling_group):
try:
asg_objects = autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[auto_scaling_group])
except botocore.exceptions.ClientError as error:
logger.error("Unable to describe autoscaling groups")
slogger.error("Unable to describe autoscaling groups")
raise error

if asg_objects["AutoScalingGroups"] and len(asg_objects["AutoScalingGroups"]) > 0:
asg = asg_objects["AutoScalingGroups"][0]
logger.debug("Auto Scaling Group: %s", asg)
slogger.debug("Auto Scaling Group: %s", asg)

availability_zone = asg["AvailabilityZones"][0]
logger.debug("Availability Zone: %s", availability_zone)
slogger.debug("Availability Zone: %s", availability_zone)

vpc_zone_identifier = asg["VPCZoneIdentifier"]
logger.debug("VPC zone identifier: %s", vpc_zone_identifier)
slogger.debug("VPC zone identifier: %s", vpc_zone_identifier)

return availability_zone, vpc_zone_identifier

Expand All @@ -73,18 +90,18 @@ def get_vpc_id(route_table):
try:
route_tables = ec2_client.describe_route_tables(RouteTableIds=[route_table])
except botocore.exceptions.ClientError as error:
logger.error("Unable to get vpc id")
slogger.error("Unable to get vpc id")
raise error
if "RouteTables" in route_tables and len(route_tables["RouteTables"]) == 1:
vpc_id = route_tables["RouteTables"][0]["VpcId"]
logger.debug("VPC ID: %s", vpc_id)
slogger.debug("VPC ID: %s", vpc_id)
return vpc_id


def get_nat_gateway_id(vpc_id, subnet_id):
nat_gateway_id = os.getenv("NAT_GATEWAY_ID")
if nat_gateway_id:
logger.info("Using NAT_GATEWAY_ID env. variable (%s)", nat_gateway_id)
slogger.info("Using NAT_GATEWAY_ID env. variable (%s)", nat_gateway_id)
return nat_gateway_id

try:
Expand All @@ -101,15 +118,15 @@ def get_nat_gateway_id(vpc_id, subnet_id):
]
)
except botocore.exceptions.ClientError as error:
logger.error("Unable to describe nat gateway")
slogger.error("Unable to describe nat gateway")
raise error

logger.debug("NAT Gateways: %s", nat_gateways)
slogger.debug("NAT Gateways: %s", nat_gateways)
if len(nat_gateways.get("NatGateways")) < 1:
raise MissingNatGatewayError(nat_gateways)

nat_gateway_id = nat_gateways['NatGateways'][0]["NatGatewayId"]
logger.debug("NAT Gateway ID: %s", nat_gateway_id)
slogger.debug("NAT Gateway ID: %s", nat_gateway_id)
return nat_gateway_id


Expand All @@ -120,10 +137,10 @@ def replace_route(route_table_id, nat_gateway_id):
"RouteTableId": route_table_id
}
try:
logger.info("Replacing existing route %s for route table %s", route_table_id, new_route_table)
slogger.info("Replacing existing route %s for route table %s", route_table_id, new_route_table)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line prints a dictionary (new_route_table), which would end up nested in json. The message here is sorta broken - "existing route {route_table_id}" is wrong, and then it prints the whole route table.

Let's update the message so it's more accurate and doesn't include a nested dictionary. It will print more cleanly in the output. It does break backward compatibility for folks monitoring for this specific phrasing, which is too bad, but worthwhile to fix. May as well do it now.

Suggested change
slogger.info("Replacing existing route %s for route table %s", route_table_id, new_route_table)
slogger.info("Updating route table %s to use NAT Gateway ID %s", route_table_id, nat_gateway_id)

Copy link
Collaborator

@bwhaley bwhaley Jan 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, most folks should be monitoring for ERROR status or the "Failed connectivity tests! Replacing route" message. This one is INFO.

Copy link
Author

@therealvio therealvio Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries. I will break this out into a separate commit.

Also, most folks should be monitoring for ERROR status or the "Failed connectivity tests! Replacing route" message. This one is INFO.

You'd think it would go without saying. Perhaps in the release we could include that as a sub dot-point? It feels weird having to tell people "how" to monitor things, but mistakes happen.

ec2_client.replace_route(**new_route_table)
except botocore.exceptions.ClientError as error:
logger.error("Unable to replace route")
slogger.error("Unable to replace route")
raise error


Expand All @@ -138,17 +155,17 @@ def check_connection(check_urls):
req = urllib.request.Request(url)
req.add_header('User-Agent', 'alternat/1.0')
urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT)
logger.debug("Successfully connected to %s", url)
slogger.debug("Successfully connected to %s", url)
return True
except urllib.error.HTTPError as error:
logger.warning("Response error from %s: %s, treating as success", url, error)
slogger.warning("Response error from %s: %s, treating as success", url, error)
return True
except urllib.error.URLError as error:
logger.error("error connecting to %s: %s", url, error)
slogger.error("error connecting to %s: %s", url, error)
except socket.timeout as error:
logger.error("timeout error connecting to %s: %s", url, error)
slogger.error("timeout error connecting to %s: %s", url, error)

logger.warning("Failed connectivity tests! Replacing route")
slogger.warning("Failed connectivity tests! Replacing route")

public_subnet_id = os.getenv("PUBLIC_SUBNET_ID")
if not public_subnet_id:
Expand All @@ -163,20 +180,20 @@ def check_connection(check_urls):

for rtb in route_tables:
replace_route(rtb, nat_gateway_id)
logger.info("Route replacement succeeded")
slogger.info("Route replacement succeeded")
return False


def connectivity_test_handler(event, context):
if not isinstance(event, dict):
logger.error(f"Unknown event: {event}")
slogger.error("Unknown event: %s", {event})
return

if event.get("source") != "aws.events":
logger.error(f"Unable to handle unknown event type: {json.dumps(event)}")
slogger.error("Unable to handle unknown event type: %s", json.dumps(event))
raise UnknownEventTypeError

logger.debug("Starting NAT instance connectivity test")
slogger.debug("Starting NAT instance connectivity test")

check_interval = int(os.getenv("CONNECTIVITY_CHECK_INTERVAL", DEFAULT_CONNECTIVITY_CHECK_INTERVAL))
check_urls = "CHECK_URLS" in os.environ and os.getenv("CHECK_URLS").split(",") or DEFAULT_CHECK_URLS
Expand Down Expand Up @@ -209,10 +226,10 @@ def handler(event, _):
if LIFECYCLE_KEY in message and ASG_KEY in message:
asg = message[ASG_KEY]
else:
logger.error("Failed to find lifecycle message to parse")
slogger.error("Failed to find lifecycle message to parse")
raise LifecycleMessageError
except Exception as error:
logger.error("Error: %s", error)
slogger.error(error)
raise error

availability_zone, vpc_zone_identifier = get_az_and_vpc_zone_identifier(asg)
Expand All @@ -227,7 +244,7 @@ def handler(event, _):

for rtb in route_tables:
replace_route(rtb, nat_gateway_id)
logger.info("Route replacement succeeded")
slogger.info("Route replacement succeeded")


class UnknownEventTypeError(Exception): pass
Expand Down
2 changes: 2 additions & 0 deletions functions/replace-route/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
boto3==1.34.90
structlog==24.4.0
orjson==3.10.14
Loading