def build_df(generators, size):
    """Assemble a mock DataFrame with one column per generator.

    Every generator is asked for ``size`` values through ``generate(size)``
    and the results are laid out side by side as columns.

    Args:
        generators: Iterable of objects exposing ``generate(size)``. Each call
            is expected to return a named ``pandas.Series`` whose name becomes
            the column label (assumption drawn from the notebook output,
            where columns carry the generator names; confirm against mlfaker).
        size: Number of rows requested from every generator.

    Returns:
        pandas.DataFrame: one column per generator, in the order given. An
        empty DataFrame is returned when ``generators`` is empty.
    """
    columns = [g.generate(size) for g in generators]
    if not columns:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame as the stack-and-transpose version did.
        return pd.DataFrame()
    # Join column-wise instead of stacking the Series as rows and transposing:
    # the transpose coerces every column to one common dtype (object as soon
    # as a string column is present), while concat keeps each column's dtype.
    return pd.concat(columns, axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
foobarfizz
01.6243451.0NaN
1-0.611756NaNa
2-0.528172NaNNaN
3-1.072969NaNNaN
40.865408NaNb
5-2.3015391.0a
61.7448121.0NaN
class DataFramer:
    """Reusable bundle of column generators that emits mock DataFrames on demand."""

    def __init__(self, generators):
        """Store the generators that define the columns.

        Args:
            generators: Iterable of objects exposing ``generate(size)``. It is
                kept by reference and re-iterated on every ``generate`` call,
                so pass a list (or other re-iterable) rather than a one-shot
                iterator.
        """
        self.generators = generators

    def generate(self, size):
        """Build a DataFrame with ``size`` rows and one column per generator.

        Args:
            size: Number of rows requested from every generator.

        Returns:
            pandas.DataFrame: one column per stored generator, in order. An
            empty DataFrame is returned when no generators are stored.
        """
        columns = [g.generate(size) for g in self.generators]
        if not columns:
            # pd.concat raises ValueError on an empty list; keep returning an
            # empty frame as the stack-and-transpose version did.
            return pd.DataFrame()
        # Column-wise concat keeps each column's own dtype; stacking the
        # Series as rows and transposing coerced all columns to a common
        # dtype (object whenever numeric and string columns were mixed).
        return pd.concat(columns, axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mikejefftomtarget
00.900856NaN3.7050720.0
1-0.683728NaN1.3200581.0
2-0.1228900.04.9999760.0
3-0.9357691.0NaNNaN
4-0.2678881.03.193118NaN
50.530355NaN1.446993NaN
def build_df_from_cols(cols_types, size=10):
    """Build a mock DataFrame from a ``{column name: type name}`` mapping.

    Args:
        cols_types: Mapping of column name to generator type name. Supported
            type names are ``"normal"`` (NormalGenerator) and
            ``"categorical"`` (CategoricalGenerator).
        size: Number of rows to generate. Defaults to 10.

    Returns:
        pandas.DataFrame: one column per mapping entry, in mapping order. An
        empty DataFrame is returned when the mapping is empty.

    Raises:
        KeyError: If a type name is not one of the supported names. Raised
            before any data is generated.
    """
    lookup = {"normal": NormalGenerator, "categorical": CategoricalGenerator}

    # Validate up front so a typo fails with a message that names the column
    # and the allowed values, instead of a bare KeyError mid-generation.
    for name, kind in cols_types.items():
        if kind not in lookup:
            raise KeyError(
                f"unknown column type {kind!r} for column {name!r}; "
                f"expected one of {sorted(lookup)}"
            )

    columns = [lookup[kind](name).generate(size) for name, kind in cols_types.items()]
    if not columns:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame as the stack-and-transpose version did.
        return pd.DataFrame()
    # Column-wise concat keeps each column's own dtype; stacking the Series
    # as rows and transposing coerced all columns to one common dtype.
    return pd.concat(columns, axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
foobar
01.6243451.0
1-0.6117561.0
2-0.5281720.0
3-1.0729690.0
40.8654081.0
def build_df_from_num_cat(nums, cats, size=10):
    """Build a mock DataFrame from numeric and categorical column specs.

    Each spec is either an ``int`` (that many auto-named columns) or an
    iterable of explicit column names. Auto-generated names are
    ``col<k>``, where ``k`` continues from the number of columns created so
    far, so ``nums=["foo"], cats=2`` yields ``foo, col1, col2``.

    Args:
        nums: Count or names of the columns built with NormalGenerator.
        cats: Count or names of the columns built with CategoricalGenerator.
        size: Number of rows to generate. Defaults to 10.

    Returns:
        pandas.DataFrame: numeric columns first, then categorical columns. An
        empty DataFrame is returned when both specs produce no columns.
    """
    # NOTE(review): the notebook output shows col0 and col1 with identical
    # values, presumably because generators built with default arguments
    # share a seed. Confirm against mlfaker before relying on independence.
    gens = []
    counter = 0  # columns created so far; offsets the auto-generated names
    for spec, gen_cls in ((nums, NormalGenerator), (cats, CategoricalGenerator)):
        if isinstance(spec, int):
            batch = [gen_cls(f"col{counter + i}") for i in range(spec)]
        else:
            batch = [gen_cls(col) for col in spec]
        counter += len(batch)
        gens.extend(batch)

    columns = [gen.generate(size) for gen in gens]
    if not columns:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame as the stack-and-transpose version did.
        return pd.DataFrame()
    # Column-wise concat keeps each column's own dtype; stacking the Series
    # as rows and transposing coerced all columns to one common dtype.
    return pd.concat(columns, axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col0col1col2col3
01.6243451.6243451.01.0
1-0.611756-0.6117561.01.0
2-0.528172-0.5281720.00.0
3-1.072969-1.0729690.00.0
40.8654080.8654081.01.0
\n", + "
" + ], + "text/plain": [ + " col0 ... col3\n", + "0 1.624345 ... 1.0\n", + "1 -0.611756 ... 1.0\n", + "2 -0.528172 ... 0.0\n", + "3 -1.072969 ... 0.0\n", + "4 0.865408 ... 1.0\n", + "\n", + "[5 rows x 4 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "build_df_from_num_cat(nums=2, cats=2, size=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "decreased-extension", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
foocol1col2
01.6243451.01.0
1-0.6117561.01.0
2-0.5281720.00.0
\n", + "
" + ], + "text/plain": [ + " foo ... col2\n", + "0 1.624345 ... 1.0\n", + "1 -0.611756 ... 1.0\n", + "2 -0.528172 ... 0.0\n", + "\n", + "[3 rows x 3 columns]" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "build_df_from_num_cat(nums=[\"foo\"], cats=2, size=3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}