=prototype for loading the binary file

oxinabox · oxinabox · commit 3bd0ab2a1cf4 · 2018-06-08T13:19:07.000+08:00
diff --git a/src/proto.ipynb b/src/proto.ipynb
@@ -0,0 +1,338 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using PretrainedEmbeddings\n",
+    "\n",
+    "using DataDeps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"FastText fr CommonCrawl Binary/cc.fr.300.bin\""
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "StatStruct(mode=0o100644, size=7238894263)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#=\n",
+    "struct entry {\n",
+    "  std::string word;\n",
+    "  int64_t count;\n",
+    "  entry_type type;\n",
+    "  std::vector<int32_t> subwords;\n",
+    "};\n",
+    "=#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1-element Array{String,1}:\n",
+       " \"cc.fr.300.bin\""
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "readdir(datadep\"FastText fr CommonCrawl Binary\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Entry"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "@enum EntryType::Int8 word_type=0 label_type=1\n",
+    "\n",
+    "struct Entry  # Julia counterpart of the C++ `struct entry` from fastText's dictionary\n",
+    "    word::String  # entry text; load_dict reads it up to the NUL terminator\n",
+    "    count::Int64  # Int64 counter stored immediately after the word in the file\n",
+    "    entry_type:: EntryType  # word_type (0) or label_type (1), an Int8-backed enum\n",
+    "    subwords::Vector{Int32}  # always left empty by load_dict; presumably subword ids - TODO confirm\n",
+    "end\n",
+    "Entry()=Entry(\"\", 0, word_type, Int32[])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "magic = read(fh, Int32) = 793712314\n",
+      "version = read(fh, Int32) = 12\n",
+      "\n",
+      "args_dim = read(fh, Int32) = 300\n",
+      "args_ws = read(fh, Int32) = 5\n",
+      "args_epoch = read(fh, Int32) = 1\n",
+      "args_minCount = read(fh, Int32) = 5\n",
+      "args_neg = read(fh, Int32) = 10\n",
+      "args_wordNgrams = read(fh, Int32) = 1\n",
+      "args_loss = read(fh, Int32) = 2\n",
+      "args_model = read(fh, Int32) = 1\n",
+      "args_bucket = read(fh, Int32) = 2000000\n",
+      "args_minn = read(fh, Int32) = 5\n",
+      "args_maxn = read(fh, Int32) = 5\n",
+      "args_lrUpdateRate = read(fh, Int32) = 100\n",
+      "args_t = read(fh, Float64) = 9.999999747378752e-6\n",
+      "\n",
+      "size_ = read(fh, Int32) = 2000000\n",
+      "nwords = read(fh, Int32) = 2000000\n",
+      "nlabels = read(fh, Int32) = 0\n",
+      "ntokens = read(fh, Int64) = 68358270953\n",
+      "pruneidx_size_ = read(fh, Int64) = -1\n",
+      "\n",
+      "length(words_) = 2000000\n",
+      "words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n",
+      "words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n",
+      "words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n",
+      "words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, Int32[])\n",
+      "words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n",
+      "\n",
+      "\n",
+      "quant_input = read(fh, Bool) = false\n",
+      "m_ = read(fh, Int64) = 4000000\n",
+      "n_ = read(fh, Int64) = 300\n",
+      "(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n",
+      "quant_output = read(fh, Bool) = false\n",
+      "m_ = read(fh, Int64) = 2000000\n",
+      "n_ = read(fh, Int64) = 300\n",
+      "(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n"
+     ]
+    }
+   ],
+   "source": [
+    "const FASTTEXT_VERSION = Int32(12); # Version 1b \n",
+    "const FASTTEXT_FILEFORMAT_MAGIC_INT32 = Int32(793712314);\n",
+    "\n",
+    "\n",
+    "function load_header(fh)\n",
+    "\t### Check Model: read the Int32 magic number, then the Int32 format version, from `fh`; assert each equals its FASTTEXT_* constant\n",
+    "    @show magic = read(fh, Int32)\n",
+    "    @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n",
+    "    @show version = read(fh, Int32)\n",
+    "    @assert version == FASTTEXT_VERSION\n",
+    "    println()\n",
+    "end\n",
+    "\n",
+    "function load_args(fh)\n",
+    "    ## Load Args: consumes twelve Int32 values followed by one Float64 (`t`) from `fh`; the values are only printed via @show, not returned. Layout reference: https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n",
+    "    @show args_dim = read(fh, Int32)\n",
+    "    @show args_ws = read(fh, Int32)\n",
+    "    @show args_epoch = read(fh, Int32)\n",
+    "    @show args_minCount = read(fh, Int32)\n",
+    "    @show args_neg = read(fh, Int32)\n",
+    "    @show args_wordNgrams = read(fh, Int32)\n",
+    "    @show args_loss = read(fh, Int32)\n",
+    "    @show args_model = read(fh, Int32)\n",
+    "    @show args_bucket = read(fh, Int32)\n",
+    "    @show args_minn = read(fh, Int32)\n",
+    "    @show args_maxn = read(fh, Int32)\n",
+    "    @show args_lrUpdateRate = read(fh, Int32)\n",
+    "    @show args_t = read(fh, Float64)\n",
+    "    println()\n",
+    "end\n",
+    "\n",
+    "function load_dict(fh)\n",
+    "    ## Load model dict: five header counts, then `size_` entries, each a word read up to a NUL (last character dropped by `[1:end-1]`), an Int64 count and an EntryType; subwords are left empty. Returns the array of Entry. Reference: https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419\n",
+    "    @show size_ = read(fh, Int32)\n",
+    "    @show nwords = read(fh, Int32)\n",
+    "    @show nlabels = read(fh, Int32)\n",
+    "    @show ntokens = read(fh, Int64)\n",
+    "    @show pruneidx_size_ = read(fh, Int64)\n",
+    "    \n",
+    "    println()\n",
+    "    words_ = map(1:size_) do ii\n",
+    "        e_word=readuntil(fh, '\\0')[1:end-1]\n",
+    "        e_count=read(fh, Int64)\n",
+    "        e_entry_type=read(fh, EntryType)\n",
+    "        Entry(e_word, e_count, e_entry_type, Int32[])\n",
+    "    end\n",
+    "    @show length(words_)\n",
+    "    @show words_[1]\n",
+    "    @show words_[2]\n",
+    "    @show words_[3]\n",
+    "    @show words_[end-1]\n",
+    "    @show words_[end]\n",
+    "    println()\n",
+    "    @assert pruneidx_size_ < 0 \n",
+    "    # The prune-index table is not parsed here; the assert above only accepts files where pruneidx_size_ < 0. See https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n",
+    "    println()\n",
+    "\t\n",
+    "\twords_\n",
+    "end\n",
+    "\n",
+    "function load_matrix(fh)\n",
+    "    ### Load Matrix: Int64 row count m_, Int64 column count n_, then m_*n_ Float32 values; returns an (m_, n_) Array{Float32,2}\n",
+    "    #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n",
+    "    # fastText writes the values row-major but Julia arrays are column-major: read as (n_, m_), then swap the axes (one extra transient copy)\n",
+    "    @show m_ = read(fh, Int64)\n",
+    "    @show n_ = read(fh, Int64)\n",
+    "    data = permutedims(read(fh, Float32, (n_, m_)), (2, 1)) # Note `real` is a typedef for `float32`\n",
+    "    @show typeof(data), size(data)\n",
+    "\tdata\n",
+    "end\n",
+    "\n",
+    "function load_fasttext_bin(filename)\n",
+    "\topen(filename) do fh\n",
+    "\t\tload_header(fh)\n",
+    "\t\tload_args(fh)\n",
+    "\t\tload_dict(fh)\n",
+    "\t\t\n",
+    "\t\t\n",
+    "\t\t@show quant_input = read(fh, Bool)\n",
+    "\t\t@assert !quant_input # a quantized input matrix is not handled by this prototype\n",
+    "\t\tinput_ = load_matrix(fh)\n",
+    "\t\t\n",
+    "\t\t@show quant_output = read(fh, Bool)\n",
+    "\t\t@assert !quant_output # a quantized output matrix is not handled either; NOTE(review): the dict, input_ and output_ are loaded but never returned\n",
+    "\t\toutput_ = load_matrix(fh)\n",
+    "\t\t\n",
+    "        @assert(eof(fh))\n",
+    "\tend\n",
+    "end\n",
+    "\n",
+    "\n",
+    "load_fasttext_bin(@datadep_str dd_name)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "```\n",
+       "readstring(stream::IO)\n",
+       "readstring(filename::AbstractString)\n",
+       "```\n",
+       "\n",
+       "Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
+      ],
+      "text/plain": [
+       "```\n",
+       "readstring(stream::IO)\n",
+       "readstring(filename::AbstractString)\n",
+       "```\n",
+       "\n",
+       "Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "?readstring"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Julia 0.6.2",
+   "language": "julia",
+   "name": "julia-0.6"
+  },
+  "language_info": {
+   "file_extension": ".jl",
+   "mimetype": "application/julia",
+   "name": "julia",
+   "version": "0.6.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}