Skip to content

Commit 3bd0ab2

Browse files
committed
=prototype for loading the binary file
1 parent 83bc5a6 commit 3bd0ab2

File tree

1 file changed

+338
-0
lines changed

1 file changed

+338
-0
lines changed

src/proto.ipynb

Lines changed: 338 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,338 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"using PretrainedEmbeddings\n",
10+
"\n",
11+
"using DataDeps"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 21,
17+
"metadata": {},
18+
"outputs": [
19+
{
20+
"data": {
21+
"text/plain": [
22+
"\"FastText fr CommonCrawl Binary/cc.fr.300.bin\""
23+
]
24+
},
25+
"execution_count": 21,
26+
"metadata": {},
27+
"output_type": "execute_result"
28+
}
29+
],
30+
"source": [
31+
"dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": 22,
37+
"metadata": {},
38+
"outputs": [
39+
{
40+
"data": {
41+
"text/plain": [
42+
"StatStruct(mode=0o100644, size=7238894263)"
43+
]
44+
},
45+
"execution_count": 22,
46+
"metadata": {},
47+
"output_type": "execute_result"
48+
}
49+
],
50+
"source": [
51+
"stat"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"#=\n",
61+
"struct entry {\n",
62+
" std::string word;\n",
63+
" int64_t count;\n",
64+
" entry_type type;\n",
65+
" std::vector<int32_t> subwords;\n",
66+
"};\n",
67+
" =#"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 24,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": [
76+
"#https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": 20,
82+
"metadata": {},
83+
"outputs": [
84+
{
85+
"data": {
86+
"text/plain": [
87+
"1-element Array{String,1}:\n",
88+
" \"cc.fr.300.bin\""
89+
]
90+
},
91+
"execution_count": 20,
92+
"metadata": {},
93+
"output_type": "execute_result"
94+
}
95+
],
96+
"source": [
97+
"readdir(datadep\"FastText fr CommonCrawl Binary\")"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 33,
103+
"metadata": {},
104+
"outputs": [
105+
{
106+
"data": {
107+
"text/plain": [
108+
"Entry"
109+
]
110+
},
111+
"execution_count": 33,
112+
"metadata": {},
113+
"output_type": "execute_result"
114+
}
115+
],
116+
"source": [
117+
"@enum EntryType::Int8 word_type=0 label_type=1\n",
118+
"\n",
119+
"struct Entry\n",
120+
" word::String\n",
121+
" count::Int64\n",
122+
" entry_type:: EntryType\n",
123+
" subwords::Vector{Int32}\n",
124+
"end\n",
125+
"Entry()=Entry(\"\", 0, word_type, Int32[])\n"
126+
]
127+
},
128+
{
129+
"cell_type": "code",
130+
"execution_count": 78,
131+
"metadata": {},
132+
"outputs": [
133+
{
134+
"name": "stdout",
135+
"output_type": "stream",
136+
"text": [
137+
"magic = read(fh, Int32) = 793712314\n",
138+
"version = read(fh, Int32) = 12\n",
139+
"\n",
140+
"args_dim = read(fh, Int32) = 300\n",
141+
"args_ws = read(fh, Int32) = 5\n",
142+
"args_epoch = read(fh, Int32) = 1\n",
143+
"args_minCount = read(fh, Int32) = 5\n",
144+
"args_neg = read(fh, Int32) = 10\n",
145+
"args_wordNgrams = read(fh, Int32) = 1\n",
146+
"args_loss = read(fh, Int32) = 2\n",
147+
"args_model = read(fh, Int32) = 1\n",
148+
"args_bucket = read(fh, Int32) = 2000000\n",
149+
"args_minn = read(fh, Int32) = 5\n",
150+
"args_maxn = read(fh, Int32) = 5\n",
151+
"args_lrUpdateRate = read(fh, Int32) = 100\n",
152+
"args_t = read(fh, Float64) = 9.999999747378752e-6\n",
153+
"\n",
154+
"size_ = read(fh, Int32) = 2000000\n",
155+
"nwords = read(fh, Int32) = 2000000\n",
156+
"nlabels = read(fh, Int32) = 0\n",
157+
"ntokens = read(fh, Int64) = 68358270953\n",
158+
"pruneidx_size_ = read(fh, Int64) = -1\n",
159+
"\n",
160+
"length(words_) = 2000000\n",
161+
"words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n",
162+
"words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n",
163+
"words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n",
164+
"words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, Int32[])\n",
165+
"words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n",
166+
"\n",
167+
"\n",
168+
"quant_input = read(fh, Bool) = false\n",
169+
"m_ = read(fh, Int64) = 4000000\n",
170+
"n_ = read(fh, Int64) = 300\n",
171+
"(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n",
172+
"quant_output = read(fh, Bool) = false\n",
173+
"m_ = read(fh, Int64) = 2000000\n",
174+
"n_ = read(fh, Int64) = 300\n",
175+
"(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n"
176+
]
177+
}
178+
],
179+
"source": [
180+
"const FASTTEXT_VERSION = Int32(12); # Version 1b \n",
181+
"const FASTTEXT_FILEFORMAT_MAGIC_INT32 = Int32(793712314);\n",
182+
"\n",
183+
"\n",
184+
"function load_header(fh)\n",
185+
"\t### Check Model\n",
186+
" @show magic = read(fh, Int32)\n",
187+
" @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n",
188+
" @show version = read(fh, Int32)\n",
189+
" @assert version == FASTTEXT_VERSION\n",
190+
" println()\n",
191+
"end\n",
192+
"\n",
193+
"function load_args(fh)\n",
194+
" ## Load Args https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n",
195+
" @show args_dim = read(fh, Int32)\n",
196+
" @show args_ws = read(fh, Int32)\n",
197+
" @show args_epoch = read(fh, Int32)\n",
198+
" @show args_minCount = read(fh, Int32)\n",
199+
" @show args_neg = read(fh, Int32)\n",
200+
" @show args_wordNgrams = read(fh, Int32)\n",
201+
" @show args_loss = read(fh, Int32)\n",
202+
" @show args_model = read(fh, Int32)\n",
203+
" @show args_bucket = read(fh, Int32)\n",
204+
" @show args_minn = read(fh, Int32)\n",
205+
" @show args_maxn = read(fh, Int32)\n",
206+
" @show args_lrUpdateRate = read(fh, Int32)\n",
207+
" @show args_t = read(fh, Float64)\n",
208+
" println()\n",
209+
"end\n",
210+
"\n",
211+
"function load_dict(fh)\n",
212+
" ## Load model dict, https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419 \n",
213+
" @show size_ = read(fh, Int32)\n",
214+
" @show nwords = read(fh, Int32)\n",
215+
" @show nlabels = read(fh, Int32)\n",
216+
" @show ntokens = read(fh, Int64)\n",
217+
" @show pruneidx_size_ = read(fh, Int64)\n",
218+
" \n",
219+
" println()\n",
220+
" words_ = map(1:size_) do ii\n",
221+
" e_word=readuntil(fh, '\\0')[1:end-1]\n",
222+
" e_count=read(fh, Int64)\n",
223+
" e_entry_type=read(fh, EntryType)\n",
224+
" Entry(e_word, e_count, e_entry_type, Int32[])\n",
225+
" end\n",
226+
" @show length(words_)\n",
227+
" @show words_[1]\n",
228+
" @show words_[2]\n",
229+
" @show words_[3]\n",
230+
" @show words_[end-1]\n",
231+
" @show words_[end]\n",
232+
" println()\n",
233+
" @assert pruneidx_size_ < 0 \n",
234+
" # Avoid loading this stuff https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n",
235+
" println()\n",
236+
"\t\n",
237+
"\twords_\n",
238+
"end\n",
239+
"\n",
240+
"function load_matrix(fh)\n",
241+
" ### Load Matrix\n",
242+
" #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n",
243+
" \n",
244+
" @show m_ = read(fh, Int64)\n",
245+
" @show n_ = read(fh, Int64)\n",
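246+
"    data = read(fh, Float32, (m_, n_)) # Note: fastText's `real` is a typedef for C++ `float` (32-bit). NOTE(review): fastText indexes its matrix as data_[i * n_ + j] (row-major) while Julia fills arrays column-major -- confirm that reading with shape (m_, n_) gives the intended layout\n",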
247+
" @show typeof(data), size(data)\n",
248+
"\tdata\n",
249+
"end\n",
250+
"\n",
251+
"function load_fasttext_bin(filename)\n",
252+
"\topen(filename) do fh\n",
253+
"\t\tload_header(fh)\n",
254+
"\t\tload_args(fh)\n",
255+
"\t\tload_dict(fh)\n",
256+
"\t\t\n",
257+
"\t\t\n",
258+
"\t\t@show quant_input = read(fh, Bool)\n",
259+
"\t\t@assert !quant_input # avoid that stuff\n",
260+
"\t\tinput_ = load_matrix(fh)\n",
261+
"\t\t\n",
262+
"\t\t@show quant_output = read(fh, Bool)\n",
263+
"\t\t@assert !quant_output # avoid that stuff\n",
264+
"\t\toutput_ = load_matrix(fh)\n",
265+
"\t\t\n",
266+
" @assert(eof(fh))\n",
267+
"\tend\n",
268+
"end\n",
269+
"\n",
270+
"\n",
271+
"load_fasttext_bin(@datadep_str dd_name)\n"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": 42,
277+
"metadata": {},
278+
"outputs": [
279+
{
280+
"name": "stdout",
281+
"output_type": "stream",
282+
"text": [
283+
"search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n",
284+
"\n"
285+
]
286+
},
287+
{
288+
"data": {
289+
"text/markdown": [
290+
"```\n",
291+
"readstring(stream::IO)\n",
292+
"readstring(filename::AbstractString)\n",
293+
"```\n",
294+
"\n",
295+
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
296+
],
297+
"text/plain": [
298+
"```\n",
299+
"readstring(stream::IO)\n",
300+
"readstring(filename::AbstractString)\n",
301+
"```\n",
302+
"\n",
303+
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
304+
]
305+
},
306+
"execution_count": 42,
307+
"metadata": {},
308+
"output_type": "execute_result"
309+
}
310+
],
311+
"source": [
312+
"?readstring"
313+
]
314+
},
315+
{
316+
"cell_type": "code",
317+
"execution_count": null,
318+
"metadata": {},
319+
"outputs": [],
320+
"source": []
321+
}
322+
],
323+
"metadata": {
324+
"kernelspec": {
325+
"display_name": "Julia 0.6.2",
326+
"language": "julia",
327+
"name": "julia-0.6"
328+
},
329+
"language_info": {
330+
"file_extension": ".jl",
331+
"mimetype": "application/julia",
332+
"name": "julia",
333+
"version": "0.6.2"
334+
}
335+
},
336+
"nbformat": 4,
337+
"nbformat_minor": 2
338+
}

0 commit comments

Comments
 (0)