Skip to content

Conversation

adayNU
Copy link

@adayNU adayNU commented Aug 11, 2022

This change adds the missing json file to the repo, as well as embeds both the vocab and merge files and uses the embedded FS to create the pre-trained tokenizer.

adayNU added 4 commits August 11, 2022 13:41
This change adds the missing json file to the repo, as well as embeds both the vocab and merge files and uses the embedded FS to create the pre-trained tokenizer.
@zhangjiayin
Copy link

Add the files to your project; I fixed it with the following code:

package xxxxx

import (
	"bufio"
	"embed"
	"encoding/json"
	"fmt"
	"log"
	"regexp"
	"strings"

	"github.com/sugarme/tokenizer"
	"github.com/sugarme/tokenizer/decoder"
	"github.com/sugarme/tokenizer/model"
	"github.com/sugarme/tokenizer/model/bpe"
	"github.com/sugarme/tokenizer/pretokenizer"
	"github.com/sugarme/tokenizer/processor"
	"github.com/sugarme/tokenizer/util"
)

//go:embed encoder.json
//go:embed vocab.bpe
var files embed.FS

// newBpeFromFiles builds a BPE model from the embedded GPT-2-style
// vocabulary ("encoder.json", a token -> id map) and merge-rank file
// ("vocab.bpe"). It mirrors what bpe's file-based constructor does, but
// reads from the embedded FS so callers need no files on disk.
func newBpeFromFiles() (*bpe.BPE, error) {
	builder := bpe.NewBpeBuilder()

	vocabBytes, err := files.ReadFile("encoder.json")
	if err != nil {
		return nil, err
	}

	vocab := model.Vocab{}
	if err := json.Unmarshal(vocabBytes, &vocab); err != nil {
		return nil, err
	}

	mergeFile, err := files.Open("vocab.bpe")
	if err != nil {
		return nil, err
	}
	defer mergeFile.Close()

	// Compile once, outside the scan loop (was recompiled per line).
	versionRe := regexp.MustCompile(`#version`)

	merges := bpe.Merges{}
	// rank is the rank of the next accepted merge; it is also the value
	// reported in error messages, matching the original lineNum behavior
	// (it does NOT count skipped lines).
	rank := 0

	scanner := bufio.NewScanner(mergeFile)
	for scanner.Scan() {
		line := scanner.Text()

		// Skip the "#version: ..." header line.
		if versionRe.MatchString(line) {
			continue
		}

		parts := strings.Split(line, " ")
		if len(parts) != 2 {
			return nil, fmt.Errorf("read merge file: invalid data at line %d", rank)
		}

		// Merges referencing tokens absent from the vocab are skipped
		// (best-effort, preserving the original behavior). Use c1/c2 here
		// rather than a/b so the builder variable is not shadowed.
		c1, ok := vocab[parts[0]]
		if !ok {
			continue
		}
		c2, ok := vocab[parts[1]]
		if !ok {
			continue
		}

		merged := parts[0] + parts[1]
		newID, ok := vocab[merged]
		if !ok {
			return nil, fmt.Errorf("read merge file: id for merged token %q not found", merged)
		}

		merges[bpe.Pair{C1: c1, C2: c2}] = bpe.PairVal{Rank: rank, NewId: newID}
		rank++
	}
	// The original called util.TraceError on a provably-nil error inside the
	// loop; apply it to the real scan error instead so failures carry a trace.
	if err := scanner.Err(); err != nil {
		return nil, util.TraceError(err)
	}

	builder.VocabAndMerges(vocab, merges)
	return builder.Build()
}
// TokenizerEncodeString encodes prompt with the given tokenizer, wrapping
// it as a single input sequence. addSpecialTokens is forwarded to Encode.
func TokenizerEncodeString(t *tokenizer.Tokenizer, prompt string, addSpecialTokens bool) (retVal *tokenizer.Encoding, err error) {
	input := tokenizer.NewSingleEncodeInput(tokenizer.NewInputSequence(prompt))
	return t.Encode(input, addSpecialTokens)
}

// NewGPT3Tokenizer constructs a byte-level BPE tokenizer from the embedded
// vocabulary and merge files. addPrefixSpace and trimOffsets are forwarded
// to the byte-level pre-tokenizer.
//
// It calls log.Fatal if the embedded model cannot be loaded, which should
// only happen if the embedded files are missing or corrupt at build time.
func NewGPT3Tokenizer(addPrefixSpace bool, trimOffsets bool) *tokenizer.Tokenizer {
	// Named bpeModel rather than "model" to avoid shadowing the imported
	// model package.
	bpeModel, err := newBpeFromFiles()
	if err != nil {
		log.Fatal(err)
	}

	tk := tokenizer.NewTokenizer(bpeModel)

	pretok := pretokenizer.NewByteLevel()
	pretok.SetAddPrefixSpace(addPrefixSpace)
	pretok.SetTrimOffsets(trimOffsets)
	tk.WithPreTokenizer(pretok)

	// The post-processor shares the pre-tokenizer's byte-level settings.
	tk.WithPostProcessor(processor.NewByteLevelProcessing(pretok))

	// "Ġ" is the byte-level BPE marker for a leading space; the decoder
	// uses it as the word suffix when reassembling text.
	tk.WithDecoder(decoder.NewBpeDecoder("Ġ"))

	return tk
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants