From add8a659ecf68831f4882c07f5465ca86b8650b7 Mon Sep 17 00:00:00 2001
From: Deepak Pathak <43314890+dpkpathak@users.noreply.github.com>
Date: Wed, 20 Aug 2025 16:23:23 +0200
Subject: [PATCH] Update 2.mdx - Convert Arrow columns to list before
 tokenization

The Transformers tokenizer expects a string or a Python list/sequence of
strings. Arrow Column objects (from datasets) are not directly supported and
raise a ValueError. Wrapping the column with list(...) materializes a list of
plain Python strings, so the tokenizer accepts them.
---
 chapters/en/chapter3/2.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapters/en/chapter3/2.mdx b/chapters/en/chapter3/2.mdx
index bc1b00179..2d9cb2877 100644
--- a/chapters/en/chapter3/2.mdx
+++ b/chapters/en/chapter3/2.mdx
@@ -129,8 +129,8 @@
 from transformers import AutoTokenizer
 
 checkpoint = "bert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
 ```
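
The failure mode this patch works around can be sketched without the real libraries. Below is a minimal, self-contained illustration (not the actual transformers or datasets code): `FakeColumn` and `tokenize` are hypothetical stand-ins for a pyarrow-backed `datasets` column and the tokenizer's input validation, showing why `list(...)` makes the call succeed.

```python
class FakeColumn:
    """Stand-in for a datasets Column: an iterable of strings
    that is NOT an instance of list or tuple."""

    def __init__(self, values):
        self._values = values

    def __iter__(self):
        return iter(self._values)


def tokenize(text):
    """Stand-in for the tokenizer's input check: accepts a str or a
    list/tuple of str, and rejects anything else with a ValueError."""
    if isinstance(text, str):
        return [text.lower().split()]
    if isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text):
        return [t.lower().split() for t in text]
    raise ValueError("text input must be of type str or List[str]")


column = FakeColumn(["A man is eating.", "Two dogs run."])

# Passing the column object directly fails the isinstance check,
# mirroring the ValueError described in the commit message.
try:
    tokenize(column)
except ValueError:
    pass

# Materializing the column as a plain Python list succeeds.
tokens = tokenize(list(column))
```

The same pattern applies to the patched lines: `raw_datasets["train"]["sentence1"]` is iterable, so `list(...)` turns it into the plain list of strings the tokenizer's validation accepts.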