{ "cells": [ { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "### Библиотеки / данные" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "импортируем библиотеки numpy и pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "задаем некоторые настройки pandas, регулирующие формат вывода" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "pd.options.display.max_rows = 10" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "загружаем данные" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips = pd.read_csv(\"../data/tips.csv\")\n", "tips.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips['tip_pct'] = tips['tip'] / tips['total_bill']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.head()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true, "heading_collapsed": true }, "source": [ "### Механизм GroupBy" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "df = pd.DataFrame({'x': ['a','a','b','b','c','c'],\n", " 'y': [2,4,0,5,5,10]})\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups = df.groupby(['x'])\n", "groups" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "получаем информацию о количестве групп, которые будут созданы" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.ngroups" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "получаем информацию о количестве элементов в каждой группе" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.size()" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "что представляют из себя найденные группы?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.groups" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "получаем данные конкретной группы" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.get_group('b')" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "извлекаем первую строку каждой группы" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.nth([1])" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "обход групп:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "for key, group in groups:\n", " print(key)\n", " print(group) " ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "вычисление среднего" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "groups.y.mean()" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "### Типы группировок" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### по столбцам: " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true, "scrolled": true }, "outputs": [], "source": [ "tips.groupby(['day','time']).tip.mean().unstack()" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### по уровням индекса" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "создаем копию данных и заново индексируем ее" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "copy_tips = tips.copy()\n", "copy_tips = copy_tips.set_index(['day', 'time'])\n", "copy_tips" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "группировать можем по одному или нескольким уровням индекса, передавая соответствующие значения столбцов аргументу level" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "copy_tips.groupby(level=['time']).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "copy_tips.groupby(level = ['day', 'time']).mean()" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### с использованием функции " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "copy_tips.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "copy_tips.groupby(len, level='day').count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.day.value_counts()" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### с использованием массива " ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "массив случайных меток групп:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "np.random.seed(123)\n", "rnd_array = np.random.choice(['first_group', 'second_group'], \n", " size=tips.shape[0],\n", " p = [0.4, 0.6])\n", "rnd_array[0:5]" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "передаем массив меток групп в метод groupby" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.groupby(rnd_array).count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "93/(93+151)" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### бонус" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "тип данных:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips = tips.groupby(tips.dtypes, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips.size()" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "### Агрегирование групп" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "создаем объект groupby:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips = tips.groupby(['day', 'smoker'])['tip_pct']" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "проверка эквивалентности методов:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips.agg == group_tips.aggregate" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "собственная функция:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "def peak_to_peak(arr):\n", " return arr.max() - arr.min()\n", "\n", "group_tips.agg([peak_to_peak])" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "комбинирование:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips.agg([peak_to_peak, 'mean'])" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "задаем имена столбцов:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips.agg([('delta_max_min', peak_to_peak), ('mean_value','mean')])" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "отдельные наборы функций для каждого столбца:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips = tips.groupby(['day', 'smoker'])['tip_pct', 'total_bill']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips.agg({'tip_pct':[('max_value', np.max),\n", " ('min_value','min')],\n", " 'total_bill': 'sum'})" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "### Преобразование групп" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### метод transform" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "df = pd.DataFrame({'Col1': ['A', 'B', 'C', 'C', 'B', 'B', 'A'],\n", " 'Col2': [1, 2, 3, 4, 2, 5, 3]})\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "df['Col3'] = df.groupby('Col1').transform(sum)\n", "df.sort_values('Col1')" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "объект groupby:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "group_tips = tips.groupby(['smoker'])['total_bill']" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "нормирование:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "norm = lambda x: (x - x.mean())/x.std()\n", "group_tips.transform(norm)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips_copy = tips.copy()\n", "tips_copy['total_bill_norm'] = group_tips.transform(norm)\n", "tips_copy.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips_copy.groupby('smoker').total_bill_norm.agg(['mean', 'std'])" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "#### метод apply" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "функция:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "def top(df, n=5, column='tip_pct'):\n", " return df.sort_values(by=column)[-n:]\n", "top(tips, n=6)" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "применяем метод apply" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.groupby('smoker').apply(top)" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "### Исключение групп" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "создаем данные для наших примеров" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "df = pd.DataFrame({'Label': list('AABCCC'),\n", " 'Values': [1, 2, 3, 4, np.nan, 8]})\n", "df" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "удаляем группы с менее чем двумя возможными значениеми" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "f = lambda x: x.Values.count() > 1\n", "df.groupby('Label').filter(f)" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "удаляем группы, в которых есть пропуски" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "f = lambda x: x.Values.isnull().sum() == 0\n", "df.groupby('Label').filter(f)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true, "heading_collapsed": true }, "source": [ "### Сводная таблица" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "средние" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.pivot_table(index=['day', 'smoker'])" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "вывод по строкам и столбцам" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n", " columns='smoker')" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "включение частичных итогов:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n", " columns='smoker', margins=True)" ] }, { "cell_type": "markdown", "metadata": { "hidden": true }, "source": [ "задаем функцию:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',\n", " aggfunc=max, margins=True)" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 1 }