{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Библиотеки / данные"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"импортируем библиотеки numpy и pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"задаем некоторые настройки pandas, регулирующие формат вывода"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"pd.options.display.max_rows = 10"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"загружаем данные"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips = pd.read_csv(\"../data/tips.csv\")\n",
"tips.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips['tip_pct'] = tips['tip'] / tips['total_bill']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"heading_collapsed": true
},
"source": [
"### Механизм GroupBy"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'x': ['a','a','b','b','c','c'],\n",
" 'y': [2,4,0,5,5,10]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups = df.groupby(['x'])\n",
"groups"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"получаем информацию о количестве групп, которые будут созданы"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.ngroups"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"получаем информацию о количестве элементов в каждой группе"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.size()"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"что представляют из себя найденные группы?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.groups"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"получаем данные конкретной группы"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.get_group('b')"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"извлекаем первую строку каждой группы"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.nth([1])"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"обход групп:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"for key, group in groups:\n",
" print(key)\n",
" print(group) "
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"вычисление среднего"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"groups.y.mean()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Типы группировок"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### по столбцам: "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"scrolled": true
},
"outputs": [],
"source": [
"tips.groupby(['day','time']).tip.mean().unstack()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### по уровням индекса"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"создаем копию данных и заново индексируем ее"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"copy_tips = tips.copy()\n",
"copy_tips = copy_tips.set_index(['day', 'time'])\n",
"copy_tips"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"группировать можем по одному или нескольким уровням индекса, передавая соответствующие значения столбцов аргументу level"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"copy_tips.groupby(level=['time']).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"copy_tips.groupby(level = ['day', 'time']).mean()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### с использованием функции "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"copy_tips.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"copy_tips.groupby(len, level='day').count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.day.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### с использованием массива "
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"массив случайных меток групп:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"np.random.seed(123)\n",
"rnd_array = np.random.choice(['first_group', 'second_group'], \n",
" size=tips.shape[0],\n",
" p = [0.4, 0.6])\n",
"rnd_array[0:5]"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"передаем массив меток групп в метод groupby"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.groupby(rnd_array).count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"93/(93+151)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### бонус"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"тип данных:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips = tips.groupby(tips.dtypes, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips.size()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Агрегирование групп"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"создаем объект groupby:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips = tips.groupby(['day', 'smoker'])['tip_pct']"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"проверка эквивалентности методов:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips.agg == group_tips.aggregate"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"собственная функция:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"def peak_to_peak(arr):\n",
" return arr.max() - arr.min()\n",
"\n",
"group_tips.agg([peak_to_peak])"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"комбинирование:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips.agg([peak_to_peak, 'mean'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"задаем имена столбцов:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips.agg([('delta_max_min', peak_to_peak), ('mean_value','mean')])"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"отдельные наборы функций для каждого столбца:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips = tips.groupby(['day', 'smoker'])['tip_pct', 'total_bill']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips.agg({'tip_pct':[('max_value', np.max),\n",
" ('min_value','min')],\n",
" 'total_bill': 'sum'})"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Преобразование групп"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### метод transform"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'Col1': ['A', 'B', 'C', 'C', 'B', 'B', 'A'],\n",
" 'Col2': [1, 2, 3, 4, 2, 5, 3]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"df['Col3'] = df.groupby('Col1').transform(sum)\n",
"df.sort_values('Col1')"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"объект groupby:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"group_tips = tips.groupby(['smoker'])['total_bill']"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"нормирование:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"norm = lambda x: (x - x.mean())/x.std()\n",
"group_tips.transform(norm)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips_copy = tips.copy()\n",
"tips_copy['total_bill_norm'] = group_tips.transform(norm)\n",
"tips_copy.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips_copy.groupby('smoker').total_bill_norm.agg(['mean', 'std'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"#### метод apply"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"функция:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"def top(df, n=5, column='tip_pct'):\n",
" return df.sort_values(by=column)[-n:]\n",
"top(tips, n=6)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"применяем метод apply"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.groupby('smoker').apply(top)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Исключение групп"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"создаем данные для наших примеров"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'Label': list('AABCCC'),\n",
" 'Values': [1, 2, 3, 4, np.nan, 8]})\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"удаляем группы с менее чем двумя возможными значениеми"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"f = lambda x: x.Values.count() > 1\n",
"df.groupby('Label').filter(f)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"удаляем группы, в которых есть пропуски"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"f = lambda x: x.Values.isnull().sum() == 0\n",
"df.groupby('Label').filter(f)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"heading_collapsed": true
},
"source": [
"### Сводная таблица"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"средние"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.pivot_table(index=['day', 'smoker'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"вывод по строкам и столбцам"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n",
" columns='smoker')"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"включение частичных итогов:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n",
" columns='smoker', margins=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"задаем функцию:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true
},
"outputs": [],
"source": [
"tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',\n",
" aggfunc=max, margins=True)"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 1
}