diff --git a/python-data/notebooks/ex01_numpy_arrays.ipynb b/python-data/notebooks/ex01_numpy_arrays.ipynb index 1a3cc78..c74be41 100644 --- a/python-data/notebooks/ex01_numpy_arrays.ipynb +++ b/python-data/notebooks/ex01_numpy_arrays.ipynb @@ -143,7 +143,7 @@ "\n", "- Create an array of shape (2, 3, 4) of zeros\n", "- Create an array of shape (2, 3, 4) of ones.\n", - "- Create an array with values 0 to 999 using the `np.arrange` function." + "- Create an array with values 0 to 999 using the `np.arange` function." ] }, { diff --git a/python-data/slides/02_python_text_formats.pdf b/python-data/slides/02_python_text_formats.pdf index 655cb24..bec9544 100644 Binary files a/python-data/slides/02_python_text_formats.pdf and b/python-data/slides/02_python_text_formats.pdf differ diff --git a/python-data/slides/02_python_text_formats.pptx b/python-data/slides/02_python_text_formats.pptx index 099d47b..7f8b006 100644 Binary files a/python-data/slides/02_python_text_formats.pptx and b/python-data/slides/02_python_text_formats.pptx differ diff --git a/python-data/slides/04_binary_formats.pdf b/python-data/slides/04_binary_formats.pdf index 75ad03b..f22cd0a 100644 Binary files a/python-data/slides/04_binary_formats.pdf and b/python-data/slides/04_binary_formats.pdf differ diff --git a/python-data/slides/04_binary_formats.pptx b/python-data/slides/04_binary_formats.pptx index 1859e30..bbe2d01 100644 Binary files a/python-data/slides/04_binary_formats.pptx and b/python-data/slides/04_binary_formats.pptx differ diff --git a/python-data/slides/05_netcdf_overview.pdf b/python-data/slides/05_netcdf_overview.pdf index 1fa0bb7..ae55f18 100644 Binary files a/python-data/slides/05_netcdf_overview.pdf and b/python-data/slides/05_netcdf_overview.pdf differ diff --git a/python-data/slides/05_netcdf_overview.pptx b/python-data/slides/05_netcdf_overview.pptx index 6c98780..06e2b0a 100644 Binary files a/python-data/slides/05_netcdf_overview.pptx and 
b/python-data/slides/05_netcdf_overview.pptx differ diff --git a/python-data/slides/07_ncgen_ncdump_cdl.pdf b/python-data/slides/07_ncgen_ncdump_cdl.pdf index ef55bad..923e50c 100644 Binary files a/python-data/slides/07_ncgen_ncdump_cdl.pdf and b/python-data/slides/07_ncgen_ncdump_cdl.pdf differ diff --git a/python-data/slides/07_ncgen_ncdump_cdl.pptx b/python-data/slides/07_ncgen_ncdump_cdl.pptx index ed5f216..1f2837e 100644 Binary files a/python-data/slides/07_ncgen_ncdump_cdl.pptx and b/python-data/slides/07_ncgen_ncdump_cdl.pptx differ diff --git a/python-data/slides/09_cfchecker.pdf b/python-data/slides/09_cfchecker.pdf index 6f0e118..b69121a 100644 Binary files a/python-data/slides/09_cfchecker.pdf and b/python-data/slides/09_cfchecker.pdf differ diff --git a/python-data/slides/09_cfchecker.pptx b/python-data/slides/09_cfchecker.pptx index 8b3cc34..7dd9523 100644 Binary files a/python-data/slides/09_cfchecker.pptx and b/python-data/slides/09_cfchecker.pptx differ diff --git a/python-data/solutions/ex01_numpy_arrays_solutions.ipynb b/python-data/solutions/ex01_numpy_arrays_solutions.ipynb index 8947b66..09ec90c 100644 --- a/python-data/solutions/ex01_numpy_arrays_solutions.ipynb +++ b/python-data/solutions/ex01_numpy_arrays_solutions.ipynb @@ -173,7 +173,7 @@ "\n", "- Create an array of shape (2, 3, 4) of zeros and print.\n", "- Create an array of shape (2, 3, 4) of ones and print.\n", - "- Create an array with values 0 to 999 using the `np.arrange` function and print." + "- Create an array with values 0 to 999 using the `np.arange` function and print." 
] }, { diff --git a/python-data/solutions/ex05_pandas.ipynb b/python-data/solutions/ex05_pandas.ipynb new file mode 100644 index 0000000..47c1975 --- /dev/null +++ b/python-data/solutions/ex05_pandas.ipynb @@ -0,0 +1,1680 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1a4fb627-2190-43ac-b4b5-93352cd23aa4", + "metadata": {}, + "source": [ + "# Working with Pandas DataFrames in Python\n", + "\n", + "Part of: Data Analysis and Visualization in Python for Ecologists (**Data Carpentry**)\n", + "\n", + "From: https://datacarpentry.org/python-ecology-lesson/02-starting-with-data/index.html\n", + "\n", + "teaching: 30 mins\n", + "exercises: 30 mins\n", + "\n", + "Questions:\n", + "- \"How can I import data in Python?\"\n", + "- \"What is Pandas?\"\n", + "- \"Why should I use Pandas to work with data?\"\n", + " \n", + "Objectives:\n", + "- \"Navigate the workshop directory and download a dataset.\"\n", + "- \"Explain what a library is and what libraries are used for.\"\n", + "- \"Describe what the Python Data Analysis Library (Pandas) is.\"\n", + "- \"Load the Python Data Analysis Library (Pandas).\"\n", + "- \"Use `read_csv` to read tabular data into Python.\"\n", + "- \"Describe what a DataFrame is in Python.\"\n", + "- \"Access and summarize data stored in a DataFrame.\"\n", + "- \"Define indexing as it relates to data structures.\"\n", + "- \"Perform basic mathematical operations and summary statistics on data in a Pandas DataFrame.\"\n", + "- \"Create simple plots.\"\n", + " \n", + "Key points:\n", + "- \"Libraries enable us to extend the functionality of Python.\"\n", + "- \"Pandas is a popular library for working with data.\"\n", + "- \"A DataFrame is a Pandas data structure that allows one to access data by column (name or index) or row.\"\n", + "- \"Aggregating data using the `groupby()` function enables you to generate 
useful summaries of data quickly.\"\n", + "- \"Plots can be created from DataFrames or subsets of data that have been generated with `groupby()`.\"\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "3c62fdbe-6cd7-473a-b1d8-288bd5dec187", + "metadata": {}, + "source": [ + "We can automate the process of performing data manipulations in Python. It's efficient to spend time\n", + "building the code to perform these tasks because once it's built, we can use it\n", + "over and over on different datasets that use a similar format. This makes our\n", + "methods easily reproducible. We can also easily share our code with colleagues\n", + "and they can replicate the same analysis.\n", + "\n", + "### Starting in the same spot\n", + "\n", + "To help the lesson run smoothly, let's ensure everyone is in the same directory.\n", + "This should help us avoid path and file name issues. At this time please\n", + "navigate to the workshop directory. If you are working in Jupyter Notebook be sure\n", + "that you start your notebook in the workshop directory.\n", + "\n", + "A quick aside: there are Python libraries, like the [OS Library](https://docs.python.org/3/library/os.html), that can work with our\n", + "directory structure; however, that is not our focus today." + ] + }, + { + "cell_type": "markdown", + "id": "d3fd3a90-3455-4df0-a4f4-9a327d81b177", + "metadata": {}, + "source": [ + "### Our Data\n", + "\n", + "For this lesson, we will be using the Portal Teaching data, a subset of the data\n", + "from Ernest et al.\n", + "[Long-term monitoring and experimental manipulation of a Chihuahuan Desert ecosystem near Portal,\n", + "Arizona, USA](http://www.esapubs.org/archive/ecol/E090/118/default.htm).\n", + "\n", + "We will be using files from the [Portal Project Teaching Database](https://figshare.com/articles/Portal_Project_Teaching_Database/1314459).\n", + "This section will use the `surveys.csv` file that can be downloaded here:\n", + "<https://ndownloader.figshare.com/files/2292172>\n", + "\n", + "We are studying the species and weight of animals caught in sites in our study\n", + "area. 
The dataset is stored as a `.csv` file: each row holds information for a\n", + "single animal, and the columns represent:\n", + "\n", + "| Column | Description |\n", + "|------------------|------------------------------------|\n", + "| record_id | Unique id for the observation |\n", + "| month | month of observation |\n", + "| day | day of observation |\n", + "| year | year of observation |\n", + "| plot_id | ID of a particular site |\n", + "| species_id | 2-letter code |\n", + "| sex | sex of animal (\"M\", \"F\") |\n", + "| hindfoot_length | length of the hindfoot in mm |\n", + "| weight | weight of the animal in grams |\n", + "\n", + "\n", + "The first few rows of our first file look like this:\n", + "\n", + "```\n", + "record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight\n", + "1,7,16,1977,2,NL,M,32,\n", + "2,7,16,1977,3,NL,M,33,\n", + "3,7,16,1977,2,DM,F,37,\n", + "4,7,16,1977,7,DM,M,36,\n", + "5,7,16,1977,3,DM,M,35,\n", + "6,7,16,1977,1,PF,M,14,\n", + "7,7,16,1977,2,PE,F,,\n", + "8,7,16,1977,1,DM,M,37,\n", + "9,7,16,1977,1,DM,F,34,\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0e8a0333-7d52-4739-a9d7-ee2388a07033", + "metadata": {}, + "outputs": [], + "source": [ + "# Download the file\n", + "import requests\n", + "url = \"https://ndownloader.figshare.com/files/2292172\"\n", + "content = requests.get(url).text\n", + "\n", + "datafile = \"surveys.csv\"\n", + "with open(datafile, \"w\") as csv:\n", + " csv.write(content)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f73c654d-65fc-40bc-a02f-b708eec455dd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "6c9ee6ef-0b55-404e-8757-678e7a760dd1", + "metadata": {}, + "source": [ + "Each time we call a function that's in a library, we use the syntax\n", + "`LibraryName.FunctionName`. 
Adding the library name with a `.` before the\n", + "function name tells Python where to find the function. In the example above, we\n", + "have imported Pandas as `pd`. This means we don't have to type out `pandas` each\n", + "time we call a Pandas function.\n", + "\n", + "\n", + "# Reading CSV Data Using Pandas\n", + "\n", + "We will begin by locating and reading our survey data which are in CSV format. CSV stands for\n", + "Comma-Separated Values and is a common way to store formatted data. Other symbols may also be used, so\n", + "you might see tab-separated, colon-separated or space separated files. It is quite easy to replace\n", + "one separator with another, to match your application. The first line in the file often has headers\n", + "to explain what is in each column. CSV (and other separators) make it easy to share data, and can be\n", + "imported and exported from many applications, including Microsoft Excel. For more details on CSV\n", + "files, see the [Data Organisation in Spreadsheets][spreadsheet-lesson5] lesson.\n", + "We can use Pandas' `read_csv` function to pull the file directly into a [DataFrame][pd-dataframe].\n", + "\n", + "## So What's a DataFrame?\n", + "\n", + "A DataFrame is a 2-dimensional data structure that can store data of different\n", + "types (including characters, integers, floating point values, factors and more)\n", + "in columns. It is similar to a spreadsheet or an SQL table or the `data.frame` in\n", + "R. A DataFrame always has an index (0-based). An index refers to the position of\n", + "an element in the data structure." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eb36110d-ebc1-4376-8e13-abe486c128f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idmonthdayyearplot_idspecies_idsexhindfoot_lengthweight
0171619772NLM32.0NaN
1271619773NLM33.0NaN
2371619772DMF37.0NaN
3471619777DMM36.0NaN
4571619773DMM35.0NaN
..............................
35544355451231200215AHNaNNaNNaN
35545355461231200215AHNaNNaNNaN
35546355471231200210RMF15.014.0
3554735548123120027DOM36.051.0
3554835549123120025NaNNaNNaNNaN
\n", + "

35549 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " record_id month day year plot_id species_id sex hindfoot_length \\\n", + "0 1 7 16 1977 2 NL M 32.0 \n", + "1 2 7 16 1977 3 NL M 33.0 \n", + "2 3 7 16 1977 2 DM F 37.0 \n", + "3 4 7 16 1977 7 DM M 36.0 \n", + "4 5 7 16 1977 3 DM M 35.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "35544 35545 12 31 2002 15 AH NaN NaN \n", + "35545 35546 12 31 2002 15 AH NaN NaN \n", + "35546 35547 12 31 2002 10 RM F 15.0 \n", + "35547 35548 12 31 2002 7 DO M 36.0 \n", + "35548 35549 12 31 2002 5 NaN NaN NaN \n", + "\n", + " weight \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "35544 NaN \n", + "35545 NaN \n", + "35546 14.0 \n", + "35547 51.0 \n", + "35548 NaN \n", + "\n", + "[35549 rows x 9 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note that pd.read_csv is used because we imported pandas as pd\n", + "pd.read_csv(datafile)" + ] + }, + { + "cell_type": "markdown", + "id": "3686eebc-e266-4496-99ff-475f69ec72ce", + "metadata": {}, + "source": [ + "We can see that there were 35,549 rows parsed. Each row has 9\n", + "columns. The first column is the index of the DataFrame. The index is used to\n", + "identify the position of the data, but it is not an actual column of the DataFrame.\n", + "It looks like the `read_csv` function in Pandas read our file properly. However,\n", + "we haven't saved any data to memory so we can work with it. We need to assign the\n", + "DataFrame to a variable. Remember that a variable is a name for a value, such as `x`,\n", + "or `data`. 
We can create a new object with a variable name by assigning a value to it using `=`.\n", + "\n", + "Let's call the imported survey data `surveys_df`:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9c692fd0-32cf-4dad-be80-64458c563ab1", + "metadata": {}, + "outputs": [], + "source": [ + "surveys_df = pd.read_csv(datafile)" + ] + }, + { + "cell_type": "markdown", + "id": "dcefc541-b4ac-4ebc-93ed-8348180f7779", + "metadata": {}, + "source": [ + "Notice when you assign the imported DataFrame to a variable, Python does not\n", + "produce any output on the screen. We can view the value of the `surveys_df`\n", + "object by typing its name into the Python command prompt." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5429befb-4dba-4251-9071-a259e96738e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idmonthdayyearplot_idspecies_idsexhindfoot_lengthweight
0171619772NLM32.0NaN
1271619773NLM33.0NaN
2371619772DMF37.0NaN
3471619777DMM36.0NaN
4571619773DMM35.0NaN
..............................
35544355451231200215AHNaNNaNNaN
35545355461231200215AHNaNNaNNaN
35546355471231200210RMF15.014.0
3554735548123120027DOM36.051.0
3554835549123120025NaNNaNNaNNaN
\n", + "

35549 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " record_id month day year plot_id species_id sex hindfoot_length \\\n", + "0 1 7 16 1977 2 NL M 32.0 \n", + "1 2 7 16 1977 3 NL M 33.0 \n", + "2 3 7 16 1977 2 DM F 37.0 \n", + "3 4 7 16 1977 7 DM M 36.0 \n", + "4 5 7 16 1977 3 DM M 35.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "35544 35545 12 31 2002 15 AH NaN NaN \n", + "35545 35546 12 31 2002 15 AH NaN NaN \n", + "35546 35547 12 31 2002 10 RM F 15.0 \n", + "35547 35548 12 31 2002 7 DO M 36.0 \n", + "35548 35549 12 31 2002 5 NaN NaN NaN \n", + "\n", + " weight \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "35544 NaN \n", + "35545 NaN \n", + "35546 14.0 \n", + "35547 51.0 \n", + "35548 NaN \n", + "\n", + "[35549 rows x 9 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df" + ] + }, + { + "cell_type": "markdown", + "id": "3c8b9fbf-5b0a-44b2-ae9b-2cbc094f2eeb", + "metadata": {}, + "source": [ + "Note: if the output is too wide to print on your narrow terminal window, you may see something\n", + "slightly different as the large set of data scrolls past. You may see simply the last column\n", + "of data.\n", + "\n", + "Never fear, all the data is there, if you scroll up. Selecting just a few rows, so it is\n", + "easier to fit on one window, you can see that pandas has neatly formatted the data to fit\n", + "our screen:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7fa897f3-1237-434e-a4dd-44c1654578bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idmonthdayyearplot_idspecies_idsexhindfoot_lengthweight
0171619772NLM32.0NaN
1271619773NLM33.0NaN
2371619772DMF37.0NaN
3471619777DMM36.0NaN
4571619773DMM35.0NaN
\n", + "
" + ], + "text/plain": [ + " record_id month day year plot_id species_id sex hindfoot_length \\\n", + "0 1 7 16 1977 2 NL M 32.0 \n", + "1 2 7 16 1977 3 NL M 33.0 \n", + "2 3 7 16 1977 2 DM F 37.0 \n", + "3 4 7 16 1977 7 DM M 36.0 \n", + "4 5 7 16 1977 3 DM M 35.0 \n", + "\n", + " weight \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df.head() # The head() method displays the first several rows of a DataFrame. It\n", + " # is discussed below." + ] + }, + { + "cell_type": "markdown", + "id": "e1a68826-2a82-4700-8f3f-b543fb1c3c13", + "metadata": {}, + "source": [ + "## Exploring our Species Survey Data\n", + "\n", + "Again, we can use the `type` function to see what kind of thing `surveys_df` is:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "86fe32ef-6617-479c-9776-c0efa316e65f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(surveys_df)" + ] + }, + { + "cell_type": "markdown", + "id": "7002ed40-3eca-49f9-8bb9-541e37c332e8", + "metadata": {}, + "source": [ + "As expected, it's a DataFrame (or, to use the full name that Python uses to refer\n", + "to it internally, a `pandas.core.frame.DataFrame`).\n", + "\n", + "What kind of things does `surveys_df` contain? 
DataFrames have an attribute\n", + "called `dtypes` that answers this:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c586df32-ef8e-45c3-ad46-21c7f6aab133", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "record_id int64\n", + "month int64\n", + "day int64\n", + "year int64\n", + "plot_id int64\n", + "species_id object\n", + "sex object\n", + "hindfoot_length float64\n", + "weight float64\n", + "dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "38184683-6906-4893-8289-3d756b1fbf08", + "metadata": {}, + "source": [ + "All the values in a column have the same type. For example, months have type\n", + "`int64`, which is a kind of integer. Cells in the month column cannot have\n", + "fractional values, but the weight and hindfoot_length columns can, because they\n", + "have type `float64`. The `object` type doesn't have a very helpful name, but in\n", + "this case it represents strings (such as 'M' and 'F' in the case of sex).\n", + "\n", + "We'll talk a bit more about what the different formats mean in a different lesson.\n", + "\n", + "### Useful Ways to View DataFrame objects in Python\n", + "\n", + "There are many ways to summarize and access the data stored in DataFrames,\n", + "using attributes and methods provided by the DataFrame object.\n", + "\n", + "To access an attribute, use the DataFrame object name followed by the attribute\n", + "name `df_object.attribute`. Using the DataFrame `surveys_df` and attribute\n", + "`columns`, an index of all the column names in the DataFrame can be accessed\n", + "with `surveys_df.columns`.\n", + "\n", + "Methods are called in a similar fashion using the syntax `df_object.method()`.\n", + "As an example, `surveys_df.head()` gets the first few rows in the DataFrame\n", + "`surveys_df` using **the `head()` method**. 
With a method, we can supply extra\n", + "information in the parentheses to control behaviour.\n", + "\n", + "Let's look at the data using these.\n", + "\n", + "> ## Challenge - DataFrames\n", + ">\n", + "> Using our DataFrame `surveys_df`, try out the attributes & methods below to see\n", + "> what they return.\n", + ">\n", + "> 1. `surveys_df.columns`\n", + "> 2. `surveys_df.shape` Take note of the output of `shape` - what format does it\n", + "> return the shape of the DataFrame in?\n", + ">\n", + "> HINT: [More on tuples, here](https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences).\n", + "> 3. `surveys_df.head()` Also, what does `surveys_df.head(15)` do?\n", + "> 4. `surveys_df.tail()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65d9b5c5-187b-495c-af05-1d29ae10e20a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "e52d475d-1ab0-4d63-b40f-79d4c101d6eb", + "metadata": {}, + "source": [ + "## Calculating Statistics From Data In A Pandas DataFrame\n", + "\n", + "We've read our data into Python. Next, let's perform some quick summary\n", + "statistics to learn more about the data that we're working with. We might want\n", + "to know how many animals were collected in each site, or how many of each\n", + "species were caught. We can perform summary stats quickly using groups. 
But\n", + "first we need to figure out what we want to group by.\n", + "\n", + "Let's begin by exploring our data:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9a898fab-d2cb-42c2-bf7d-231b1c674c1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',\n", + " 'hindfoot_length', 'weight'],\n", + " dtype='object')" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at the column names\n", + "surveys_df.columns" + ] + }, + { + "cell_type": "markdown", + "id": "f62f48b9-2915-49a4-8b8d-2249f5f61698", + "metadata": {}, + "source": [ + "Let's get a list of all the species. The `pd.unique` function tells us all of\n", + "the unique values in the `species_id` column." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "026c7b71-f64a-4dd8-9c52-8b539c15f36c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['NL', 'DM', 'PF', 'PE', 'DS', 'PP', 'SH', 'OT', 'DO', 'OX', 'SS',\n", + " 'OL', 'RM', nan, 'SA', 'PM', 'AH', 'DX', 'AB', 'CB', 'CM', 'CQ',\n", + " 'RF', 'PC', 'PG', 'PH', 'PU', 'CV', 'UR', 'UP', 'ZL', 'UL', 'CS',\n", + " 'SC', 'BA', 'SF', 'RO', 'AS', 'SO', 'PI', 'ST', 'CU', 'SU', 'RX',\n", + " 'PB', 'PL', 'PX', 'CT', 'US'], dtype=object)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.unique(surveys_df['species_id'])" + ] + }, + { + "cell_type": "markdown", + "id": "5aa04d1b-5743-4f76-b6e9-b72dbbec3db8", + "metadata": {}, + "source": [ + "> ## Challenge - Statistics\n", + ">\n", + "> 1. Create a list of unique site ID's (\"plot_id\") found in the surveys data. Call it\n", + "> `site_names`. How many unique sites are there in the data? How many unique\n", + "> species are in the data?\n", + ">\n", + "> 2. 
What is the difference between `len(site_names)` and `surveys_df['plot_id'].nunique()`?" + ] + }, + { + "cell_type": "markdown", + "id": "77564088-9128-4d94-ace5-110e1518f86b", + "metadata": {}, + "source": [ + "# Groups in Pandas\n", + "\n", + "We often want to calculate summary statistics grouped by subsets or attributes\n", + "within fields of our data. For example, we might want to calculate the average\n", + "weight of all individuals per site.\n", + "\n", + "We can calculate basic statistics for all records in a single column using the\n", + "syntax below:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9f78f9f4-e387-405f-9dfa-1549eea21769", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 32283.000000\n", + "mean 42.672428\n", + "std 36.631259\n", + "min 4.000000\n", + "25% 20.000000\n", + "50% 37.000000\n", + "75% 48.000000\n", + "max 280.000000\n", + "Name: weight, dtype: float64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df['weight'].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "3f9cccd0-5861-42c8-ac47-94d241760afe", + "metadata": {}, + "source": [ + "We can also extract one specific metric if we wish:\n", + "\n", + "```\n", + "surveys_df['weight'].min()\n", + "surveys_df['weight'].max()\n", + "surveys_df['weight'].mean()\n", + "surveys_df['weight'].std()\n", + "surveys_df['weight'].count()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "2546bf29-ae70-4897-ba4f-3892122b2d3b", + "metadata": {}, + "source": [ + "But if we want to summarize by one or more variables, for example sex, we can\n", + "use **Pandas' `.groupby` method**. Once we've created a groupby DataFrame, we\n", + "can quickly calculate summary statistics by a group of our choice." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "00fc49b2-c1a3-4db5-ba2c-0f6b2627c927", + "metadata": {}, + "outputs": [], + "source": [ + "# Group data by sex\n", + "grouped_data = surveys_df.groupby('sex')" + ] + }, + { + "cell_type": "markdown", + "id": "612c8906-b9a7-491e-8fb4-1a2388277119", + "metadata": {}, + "source": [ + "The **pandas function `describe`** will return descriptive stats including: mean,\n", + "median, max, min, std and count for a particular column in the data. Pandas'\n", + "`describe` function will only return summary values for columns containing\n", + "numeric data." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "421e6cc7-2f04-4732-93d3-c65eb844ec10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idmonthdayyearplot_idhindfoot_lengthweight
sex
F18036.4120466.58304716.0071381990.64499711.44085428.83678042.170555
M17754.8356016.39266816.1842861990.48040111.09828229.70957842.995379
\n", + "
" + ], + "text/plain": [ + " record_id month day year plot_id \\\n", + "sex \n", + "F 18036.412046 6.583047 16.007138 1990.644997 11.440854 \n", + "M 17754.835601 6.392668 16.184286 1990.480401 11.098282 \n", + "\n", + " hindfoot_length weight \n", + "sex \n", + "F 28.836780 42.170555 \n", + "M 29.709578 42.995379 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Summary statistics for all numeric columns by sex\n", + "grouped_data.describe()\n", + "# Provide the mean for each numeric column by sex\n", + "grouped_data.mean()" + ] + }, + { + "cell_type": "markdown", + "id": "f32f9aeb-3a0e-45cf-a58d-e1e855a4f142", + "metadata": {}, + "source": [ + "The `groupby` command is powerful in that it allows us to quickly generate\n", + "summary stats.\n", + "\n", + "> ## Challenge - Summary Data\n", + ">\n", + "> 1. How many recorded individuals are female `F` and how many male `M`?\n", + "> 2. What happens when you group by two columns using the following syntax and\n", + "> then calculate mean values?\n", + "> - `grouped_data2 = surveys_df.groupby(['plot_id', 'sex'])`\n", + "> - `grouped_data2.mean()`\n", + "> 3. Summarize weight values for each site in your data. 
HINT: you can use the\n", + "> following syntax to only create summary statistics for one column in your data.\n", + "> `by_site['weight'].describe()`\n", + ">\n", + ">\n", + ">> ## Did you get #3 right?\n", + ">> **A Snippet of the Output from challenge 3 looks like:**\n", + ">>\n", + ">> ```\n", + ">> site\n", + ">> 1 count 1903.000000\n", + ">> mean 51.822911\n", + ">> std 38.176670\n", + ">> min 4.000000\n", + ">> 25% 30.000000\n", + ">> 50% 44.000000\n", + ">> 75% 53.000000\n", + ">> max 231.000000\n", + ">> ...\n", + ">> ```" + ] + }, + { + "cell_type": "markdown", + "id": "00512ec3-e3be-4fd9-a3b0-f99633a0737e", + "metadata": {}, + "source": [ + "## Quickly Creating Summary Counts in Pandas\n", + "\n", + "Let's next count the number of samples for each species. We can do this in a few\n", + "ways, but we'll use `groupby` combined with **a `count()` method**." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "56735fff-81c5-47bc-abe3-f57a206a2b8e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "species_id\n", + "AB 303\n", + "AH 437\n", + "AS 2\n", + "BA 46\n", + "CB 50\n", + "CM 13\n", + "CQ 16\n", + "CS 1\n", + "CT 1\n", + "CU 1\n", + "CV 1\n", + "DM 10596\n", + "DO 3027\n", + "DS 2504\n", + "DX 40\n", + "NL 1252\n", + "OL 1006\n", + "OT 2249\n", + "OX 12\n", + "PB 2891\n", + "PC 39\n", + "PE 1299\n", + "PF 1597\n", + "PG 8\n", + "PH 32\n", + "PI 9\n", + "PL 36\n", + "PM 899\n", + "PP 3123\n", + "PU 5\n", + "PX 6\n", + "RF 75\n", + "RM 2609\n", + "RO 8\n", + "RX 2\n", + "SA 75\n", + "SC 1\n", + "SF 43\n", + "SH 147\n", + "SO 43\n", + "SS 248\n", + "ST 1\n", + "SU 5\n", + "UL 4\n", + "UP 8\n", + "UR 10\n", + "US 4\n", + "ZL 2\n", + "Name: record_id, dtype: int64\n" + ] + } + ], + "source": [ + "# Count the number of samples by species\n", + "species_counts = surveys_df.groupby('species_id')['record_id'].count()\n", + "print(species_counts)" + ] + }, + { + "cell_type": "markdown", + "id": 
"91527637-b894-4ab7-89d3-3baaf6c7c564", + "metadata": {}, + "source": [ + "Or, we can also count just the rows that have the species \"DO\":" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "014171dd-62da-4bcb-95e0-98b4818085a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3027" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df.groupby('species_id')['record_id'].count()['DO']" + ] + }, + { + "cell_type": "markdown", + "id": "d402cd18-b262-4f1c-9cf9-d95fd68de038", + "metadata": {}, + "source": [ + "> ## Challenge - Make a list\n", + ">\n", + "> What's another way to create a list of species and associated `count` of the\n", + "> records in the data? Hint: you can perform `count`, `min`, etc. functions on\n", + "> groupby DataFrames in the same way you can perform them on regular DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e8ae98-dd6c-4e70-b6ed-866d67160f8d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2e4f40b1-6568-4db2-acfe-affbf491cfc1", + "metadata": {}, + "source": [ + "## Basic Math Functions\n", + "\n", + "If we wanted to, we could perform math on an entire column of our data. For\n", + "example let's multiply all weight values by 2. A more practical use of this might\n", + "be to normalize the data according to a mean, area, or some other value\n", + "calculated from our data." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a5acc742-0c2b-42d9-b91b-189e01f7f5bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 NaN\n", + "1 NaN\n", + "2 NaN\n", + "3 NaN\n", + "4 NaN\n", + " ... 
\n", + "35544 NaN\n", + "35545 NaN\n", + "35546 28.0\n", + "35547 102.0\n", + "35548 NaN\n", + "Name: weight, Length: 35549, dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Multiply all weight values by 2\n", + "surveys_df['weight']*2" + ] + }, + { + "cell_type": "markdown", + "id": "d5e6fb78-a9b1-45b1-8205-08b93ab3c9fd", + "metadata": {}, + "source": [ + "# Quick & Easy Plotting Data Using Pandas\n", + "\n", + "We can plot our summary stats using Pandas, too." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "da1e26aa-ce14-42a5-854b-945c98b7c000", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Make sure figures appear inline in Ipython Notebook\n", + "%matplotlib inline\n", + "# Create a quick bar chart\n", + "species_counts.plot(kind='bar');" + ] + }, + { + "cell_type": "markdown", + "id": "0e4c591a-ca11-4535-aadf-d74e50d5fe1b", + "metadata": {}, + "source": [ + "Count per species site\n", + "\n", + "We can also look at how many animals were captured in each site:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "f6ec232f-9ec7-4ca0-b01f-9fc7c3947489", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEKCAYAAAD+XoUoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAT2ElEQVR4nO3dfbBcdX3H8feXBCmIIJDwGDCMjVJAiRIiVasoDsZHYgudYEeiQ02rCNSHqcE+oE7Thk6ltVqYRkGCChhRDCoP0igyWgQCRkJ4GFIIkEkkEXzAtkMlfPvH+UV2Nrt77917sze5v/dr5sye/Z3zPed37+5+9uxvd89GZiJJqsMu490BSdLgGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRWZPN4dGMqUKVNy+vTp490NSdqp3HHHHT/LzKnt7Tt86E+fPp2VK1eOdzckaacSEQ93and4R5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klSRHf7LWdKgTV/47a7L1i1+ywB7Io09j/QlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkWqOLWyp8qVpIZH+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUZ8iybEXEocBlwIPAMsCQzPx0R+wJfAaYD64A/zsyfl5pzgTOALcDZmXlDaT8WuBTYHbgWOCczc2z/JEljzTPVThzDOdJ/GvhwZv4ecDxwZkQcCSwEVmTmDGBFuU5ZNg84CpgDXBgRk8q2LgIWADPKNGcM/xZJ0hCGDP3M3JiZd5b5J4F7gUOAk4GlZbWlwNwyfzJwZWY+lZkPAWuB2RFxELBXZt5Sju4va6mRJA3AiH5EJSKmAy8DbgUOyMyN0DwxRMT+ZbVDgB+1lK0vbb8p8+3tnfazgOYVAYcddthv232JKUmjM+w3ciNiT+BrwF9k5q96rdqhLXu0b9uYuSQzZ2XmrKlTpw63i5KkIQwr9CNiV5rA/3Jmfr00P1aGbCiXm0r7euDQlvJpwIbSPq1DuyRpQIbz6Z0ALgbuzcwLWhZdA8wHFpfL5S3tl0fEBcDBNG/Y3paZWyLiyYg4nmZ46HTgM2P2l+xAHI
aStKMazpj+q4B3AasjYlVp+xhN2C+LiDOAR4BTATJzTUQsA+6h+eTPmZm5pdS9j2c/snldmSRJAzJk6GfmD+g8Hg9wYpeaRcCiDu0rgaNH0kFJ0tgZ0ad3pNFw2Esaf4a+JiyfZKRtee4dSaqIoS9JFXF4RyPmsMnY8v+pQfJIX5IqYuhLUkUMfUmqiKEvSRXxjdwefINN0kTjkb4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRSaPdwc0etMXfrvrsnWL3zLAnkja0XmkL0kVMfQlqSJDDu9ExCXAW4FNmXl0afs48F5gc1ntY5l5bVl2LnAGsAU4OzNvKO3HApcCuwPXAudkZo7lHyONJ4fZtDMYzpH+pcCcDu3/nJkzy7Q18I8E5gFHlZoLI2JSWf8iYAEwo0ydtilJ2o6GDP3MvBl4YpjbOxm4MjOfysyHgLXA7Ig4CNgrM28pR/eXAXP77LMkqU+jGdP/QETcFRGXRMQ+pe0Q4NGWddaXtkPKfHu7JGmA+g39i4AXAjOBjcCnSnt0WDd7tHcUEQsiYmVErNy8eXO31SRJI9RX6GfmY5m5JTOfAT4HzC6L1gOHtqw6DdhQ2qd1aO+2/SWZOSszZ02dOrWfLkqSOugr9MsY/VbvAO4u89cA8yJit4g4nOYN29sycyPwZEQcHxEBnA4sH0W/JUl9GM5HNq8ATgCmRMR64DzghIiYSTNEsw74M4DMXBMRy4B7gKeBMzNzS9nU+3j2I5vXlUmSNEBDhn5mntah+eIe6y8CFnVoXwkcPaLeSZLGlOfekbTd+IW1HY+nYZCkihj6klQRQ1+SKuKYvqQJY9DvIeyM71l4pC9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kV8ZezdiA746/wSNuDj4Xtx9CXVL2anmQc3pGkihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIr4jVxpJ1TTN0g1tgx9SdpJjMWT/ZDDOxFxSURsioi7W9r2jYgbI+KBcrlPy7JzI2JtRNwfEW9saT82IlaXZf8aETGsHkqSxsxwxvQvBea0tS0EVmTmDGBFuU5EHAnMA44qNRdGxKRScxGwAJhRpvZtSpK2syFDPzNvBp5oaz4ZWFrmlwJzW9qvzMynMvMhYC0wOyIOAvbKzFsyM4HLWmokSQPS76d3DsjMjQDlcv/SfgjwaMt660vbIWW+vV2SNEBj/ZHNTuP02aO980YiFkTEyohYuXnz5jHrnCTVrt/Qf6wM2VAuN5X29cChLetNAzaU9mkd2jvKzCWZOSszZ02dOrXPLkqS2vUb+tcA88v8fGB5S/u8iNgtIg6necP2tjIE9GREHF8+tXN6S40kaUCG/Jx+RFwBnABMiYj1wHnAYmBZRJwBPAKcCpCZayJiGXAP8DRwZmZuKZt6H80ngXYHriuTJGmAhgz9zDyty6ITu6y/CFjUoX0lcPSIeidJGlOee0eSKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIP4xesbH4kWVJOxeP9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRTy1slQRT6ctj/QlqSKGviRVxNCXpIoY+pJUEUNfkirip3ckacDG81NUHulLUkUMfUmqyKhCPyLWRcTqiFgVEStL274RcWNEPFAu92lZ/9yIWBsR90fEG0fbeUnSyIzFkf7rMnNmZs4q1xcCKzJzBrCiXCcijgTmAUcBc4ALI2LSGOxfkjRM22N452RgaZ
lfCsxtab8yM5/KzIeAtcDs7bB/SVIXow39BL4TEXdExILSdkBmbgQol/uX9kOAR1tq15e2bUTEgohYGRErN2/ePMouSpK2Gu1HNl+VmRsiYn/gxoi4r8e60aEtO62YmUuAJQCzZs3quI4kaeRGdaSfmRvK5Sbgaprhmsci4iCAcrmprL4eOLSlfBqwYTT7lySNTN9H+hHxXGCXzHyyzJ8EfBK4BpgPLC6Xy0vJNcDlEXEBcDAwA7htFH1XJTwdsDR2RjO8cwBwdURs3c7lmXl9RNwOLIuIM4BHgFMBMnNNRCwD7gGeBs7MzC2j6r0kaUT6Dv3MfBA4pkP748CJXWoWAYv63ackaXT8Rq4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klSRgYd+RMyJiPsjYm1ELBz0/iWpZgMN/YiYBPwb8CbgSOC0iDhykH2QpJoN+kh/NrA2Mx/MzP8DrgROHnAfJKlakZmD21nEKcCczPzTcv1dwCsy8wNt6y0AFpSrLwbu77LJKcDP+ujKzlC3M/TROuus23HrXpCZU7dpzcyBTcCpwOdbrr8L+MwotrdyotbtDH20zjrrdr66QQ/vrAcObbk+Ddgw4D5IUrUGHfq3AzMi4vCIeA4wD7hmwH2QpGpNHuTOMvPpiPgAcAMwCbgkM9eMYpNLJnDdztBH66yzbierG+gbuZKk8eU3ciWpIoa+JFXE0Jekikz40I+IIyLixIjYs619zhB1syPiuDJ/ZER8KCLe3Mf+L+uj5tVlfycNsd4rImKvMr97RHwiIr4ZEedHxN496s6OiEO7Le9R95yIOD0i3lCuvzMiPhsRZ0bErkPUvjAiPhIRn46IT0XEn/fqo6TeImL/vuomwhu5EfGezPxCh/azgTOBe4GZwDmZubwsuzMzX95le+fRnB9oMnAj8ArgJuANwA2ZuahLXfvHTwN4HfBdgMx8e5e62zJzdpl/b+nz1cBJwDczc3GXujXAMeVTUUuA/wGuAk4s7X/Ype6XwH8D/wVcAXw1Mzd3Wret7ss0/5M9gF8AewJfL/uLzJzfpe5s4G3A94E3A6uAnwPvAN6fmTcNte9aRMT+mblpgPvbLzMfH9T+tpdyAHEuMBfY+i3UTcByYHFm/qKPbV6XmW/qsmyvsr9pwHWZeXnLsgsz8/1d6g4EzgOeAf4WOAv4I5qMOiczN3ap27e9CbgDeBnNY++JYf9h/XwLbEebgEe6tK8G9izz04GV5R8L8OMe21tN85HSPYBfAXuV9t2Bu3rU3Ql8CTgBeG253FjmX9uj7sct87cDU8v8c4HVPerubd1327JVvfZH8yrvJOBiYDNwPTAfeF6PurvK5WTgMWBSuR5D/F9Wt6y7B3BTmT9siNthb2AxcB/weJnuLW3P7/O+cl2PZXsB/wB8EXhn27ILe9QdCFxEczLB/YCPl795GXBQj7p926b9gHXAPsC+PermtP2PLgbuAi4HDuhRtxiYUuZnAQ8Ca4GHh7h/3gn8NfDCEf6vZwHfK4+JQ2kOoH5Z7uMv61G3J/BJYE1ZfzPwI+DdPWpuAD4KHNh2u3wUuLFH3cu7TMcCG3vUfa38P+fSfNfoa8BunR6LbXXX0wT9wnKbfbQ8Ds4ClveoewZ4qG36Tbl8cES3Sz8PnPGYyj+o07QaeKpLzT0d7kzXAxcwRCh2mi/Xe9XtAnyw3LlnlrYhbxDgJ+WBvh9tX6tu33/bsq8C7ynzXwBmlfkXAbf3qGt/gtgVeDvNUf/mHnV3A88pfX2SEkzA79DyBNShbnXLA2If4I7Wbfao84E8xO0HfB74O+AF5b73jV63Q8
v894DjWu4vXb/OX/rzT8AjwG1lPwcP4359G80r5tOAR4FTSvuJwC096pYD76Y5iv4Q8DfADGAp8Pddau7vsb1ey7bQvBL/Xofpf3vUrWq7/lfAD2kew73uKz9umX+k1zbbln2k3M9e0nq7DHUbdNxWP0XjMdEcWc4sd+7WaTqwoUvNdynh29I2GbgM2NJjX7cCe5T5XVra9+51g7asN40mkD/bfsN2WX8dzVHXQ+XywNK+5xB3hL2BS2mGaW4tgfEgzTDKMcO543VYtnuPZR8s238YOBtYAXyOJtTP61F3Dk0YLqE5at/6RDUVuLlHnQ/kznV3dtv+EPu7D5hc5n/UtqzXK8rW/f0BcCHw0/L/XNDn/6XXffAnbddvL5e7APd1qfkO8Je0vNIBDqB5Av6PHvu6G5jRZdmjPerupSUbStt8mlcnDw/nbwP+bri3QVm+NVcuAJ7HCI/wf7udforGY6J5CfvqLssu7/FPOrDLslf12NduXdqntD5Ah9Hnt9DlyGSY9XsAhw9jvecBx9AcyXZ9ed+y/otG0aeDKUd5wPOBU4DZw6g7qqx7xAj25QO5c816miPgD9M8CUfLsl7DbGeV/+nraYag/gV4DfAJ4Is96rZ5wqMZ/pwDfKFH3S00Q4in0hwozC3tr6X3K4v/3PpYp3kv6IaWZR2f7GleQZ5P88T2c+CJcnueT++hslOAF3dZNrdH3T8Cb+jQPgd4oEfdJylDzm3tvwtcNczHxdtohrt+Opz1t6nvp8jJaRBT2wP5ibYH8j496ib0A5nmjcDWaet7QAcClw1RewLwFZr3dVYD19Kcxnxyj5or+7z9jqEZorsOOAL4NM0HANYAr+xR91KaoaFfAD+gHKTQvDI8u0fdETQfttizrX3OEP08gmbIaazq3rS990fz/uLRw6nbZjv93JhOTuM9UYaIJlJd2wN5h+3njlhHM9x4P/ANmuHSk1uW9Rqa67furAHX9dXPjtvq5x/v5DTeE8N4r8S6euoY3Sf1Jmxdp2mgZ9mURiIi7uq2iGZs3zrrtpqUmb8GyMx1EXECcFVEvKDUdTPR67Zh6GtHdgDwRpo35loFzZt91lm31U8jYmZmrgLIzF9HxFuBS4CX9NjXRK/bhqGvHdm3aF7SrmpfEBE3WWddi9OBp1sbMvNp4PSI+Pce+5rodduYEKdhkCQNz4Q/4Zok6VmGviRVxNCXpIoY+lKbiLgpImYNsc7HhrGdjp80iYhLI+KUfvsnjYahL/VnyNDPzFcOoiPSSBj6qlZETI+I+yJiaUTcFRFXRcQebeucFhGrI+LuiDi/tC0Gdo+IVeWHZbpt/9flMsovjN0TEd8G+vrFI2ksGPqq3YuBJZn5UpofzPntLx5FxME0J3d7Pc1pvY+LiLmZuZDmFM0zM/NPhrGPd5T9vAR4L+ArAI0bQ1+1ezQzf1jmvwS8umXZcTS/8rW5fBHmyzSnIh6p1wBXZOaWzNxA+flMaTwY+qpd+7cTW6+P6JwmI9yPNC4MfdXusIj4/TJ/Gs3527e6FXhtREyJiEll+ffLst9ExK7D3MfNwLyImBQRBwGvG4uOS/0w9FW7e4H55cyO+9L8wDkAmbkROJfmZwF/QnPe8uVl8RLgrl5v5La4GniA5vS4F/HsE4c0cJ57R9WKiOnAtzLz6PHuizQoHulLUkU80pdGISL2A1Z0WHRiZj4+6P5IQzH0JakiDu9IUkUMfUmqiKEvSRUx9CWpIoa+JFXk/wHAtX7qjHoiDQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "total_count = surveys_df.groupby('plot_id')['record_id'].nunique()\n", + "# Let's plot that too\n", + "total_count.plot(kind='bar')" + ] + }, + { + "cell_type": "markdown", + "id": "f2aa1252-3611-4c2a-aafa-12c0c892e43a", + "metadata": {}, + "source": [ + "> ## Challenge - Plots\n", + ">\n", + "> 1. Create a plot of average weight across all species per site.\n", + "> 2. Create a plot of total males versus total females for the entire dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "778470a0-3584-4b48-87c3-1d5f631bbe57", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5a2945e0-d51f-487d-81a6-44395601df1c", + "metadata": {}, + "source": [ + "> ## Summary Plotting Challenge\n", + ">\n", + "> Create a stacked bar plot, with weight on the Y axis, and the stacked variable\n", + "> being sex. The plot should show total weight by sex for each site. Some\n", + "> tips are below to help you solve this challenge:\n", + ">\n", + "> * For more information on pandas plots, see [pandas' documentation page on visualization][pandas-plot].\n", + "> * You can use the code that follows to create a stacked bar plot but the data to stack\n", + "> need to be in individual columns. 
Here's a simple example with some data where\n", + "> 'a', 'b', and 'c' are the groups, and 'one' and 'two' are the subgroups.\n", + ">\n", + "> ```\n", + "> d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}\n", + "> pd.DataFrame(d)\n", + "> ```\n", + ">\n", + "> shows the following data\n", + ">\n", + "> ```\n", + "> one two\n", + "> a 1 1\n", + "> b 2 2\n", + "> c 3 3\n", + "> d NaN 4\n", + "> ```\n", + ">\n", + "> We can plot the above with\n", + ">\n", + "> ```\n", + "> # Plot stacked data so columns 'one' and 'two' are stacked\n", + "> my_df = pd.DataFrame(d)\n", + "> my_df.plot(kind='bar', stacked=True, title=\"The title of my graph\")\n", + "> ```\n", + ">\n", + "> * You can use the `.unstack()` method to transform grouped data into columns\n", + "> for each plotting. Try running `.unstack()` on some DataFrames above and see\n", + "> what it yields.\n", + ">\n", + "> Start by transforming the grouped data (by site and sex) into an unstacked layout, then create a stacked plot.\n", + ">" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1c9d455-6187-40ac-9953-b4ec0ac4b95d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "fb555c44-643c-4287-b80a-608ee0344b55", + "metadata": {}, + "source": [ + ">> ## Solution to Summary Challenge\n", + ">>\n", + ">> First we group data by site and by sex, and then calculate a total for each site.\n", + ">>\n", + ">> ```\n", + ">> by_site_sex = surveys_df.groupby(['plot_id', 'sex'])\n", + ">> site_sex_count = by_site_sex['weight'].sum()\n", + ">> ```\n", + ">>\n", + ">> This calculates the sums of weights for each sex within each site as a table\n", + ">>\n", + ">> ```\n", + ">> site sex\n", + ">> plot_id sex\n", + ">> 1 F 38253\n", + ">> M 59979\n", + ">> 2 F 50144\n", + ">> M 57250\n", + ">> 3 F 27251\n", + ">> M 28253\n", + ">> 4 F 39796\n", + ">> M 49377\n", + ">> \n", + ">> 
```\n", + ">>\n", + ">> Below we'll use `.unstack()` on our grouped data to figure out the total weight that each sex contributed to each site.\n", + ">>\n", + ">> ```\n", + ">> by_site_sex = surveys_df.groupby(['plot_id', 'sex'])\n", + ">> site_sex_count = by_site_sex['weight'].sum()\n", + ">> site_sex_count.unstack()\n", + ">> ```\n", + ">>\n", + ">> The `unstack` method above will display the following output:\n", + ">>\n", + ">> ```\n", + ">> sex F M\n", + ">> plot_id\n", + ">> 1 38253 59979\n", + ">> 2 50144 57250\n", + ">> 3 27251 28253\n", + ">> 4 39796 49377\n", + ">> \n", + ">> ```\n", + ">>\n", + ">> Now, create a stacked bar plot with that data where the weights for each sex are stacked by site.\n", + ">>\n", + ">> Rather than display it as a table, we can plot the above data by stacking the values of each sex as follows:\n", + ">>\n", + ">> ```\n", + ">> by_site_sex = surveys_df.groupby(['plot_id', 'sex'])\n", + ">> site_sex_count = by_site_sex['weight'].sum()\n", + ">> spc = site_sex_count.unstack()\n", + ">> s_plot = spc.plot(kind='bar', stacked=True, title=\"Total weight by site and sex\")\n", + ">> s_plot.set_ylabel(\"Weight\")\n", + ">> s_plot.set_xlabel(\"Plot\")\n", + ">> ```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa3cc636-053b-4b92-9ed3-be349d4d1285", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "9a864b9b-918d-4d08-98b6-7a140ce679ca", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "- ernst: http://www.esapubs.org/archive/ecol/E090/118/default.htm\n", + "- figshare-ndownloader: https://ndownloader.figshare.com/files/2292172\n", + "- os-lib: https://docs.python.org/3/library/os.html\n", + "- matplotlib: https://matplotlib.org\n", + "- numpy: https://www.numpy.org/\n", + "- pandas: https://pandas.pydata.org\n", + "- pandas-plot: http://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#basic-plotting-plot\n", + "- pd-dataframe: 
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html\n", + "- pptd: https://figshare.com/articles/Portal_Project_Teaching_Database/1314459\n", + "- python-datastructures: https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences\n", + "- spreadsheet-lesson5: http://www.datacarpentry.org/spreadsheet-ecology-lesson/05-exporting-data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e4c4a30-de40-4ffd-befd-9b8dda7da968", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "7c6d9e59-e673-4ae3-9ca9-0a1f5b6b7509", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 + Jaspy", + "language": "python", + "name": "jaspy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python-data/solutions/ex06_pandas_rainfall.ipynb b/python-data/solutions/ex06_pandas_rainfall.ipynb new file mode 100644 index 0000000..b5e9192 --- /dev/null +++ b/python-data/solutions/ex06_pandas_rainfall.ipynb @@ -0,0 +1,1009 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af6d2be1-db21-4ad7-9c12-ebf4ec6b79e1", + "metadata": {}, + "source": [ + "# Pandas to read CSV data\n", + "\n", + "Let's see Pandas in action, to understand some of its power and utility..." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1be9623f-bdbb-402f-8d74-7a8ef03ac2bd", + "metadata": {}, + "source": [ + "## If we just want the first 6 rows, just use readline()\n", + "\n", + "We know there are 6 header lines, we can get those using Python's `open()` and `f.readline()`:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d179c1f2-9730-492b-b6fe-9c6fddb33c96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Item: UK Rainfall (mm)\n", + "Item: Areal series, starting from 1910\n", + "Item: Allowances have been made for topographic, coastal and urban effects where relationships are found to exist.\n", + "Item: Seasons: Winter=Dec-Feb, Spring=Mar-May, Summer=June-Aug, Autumn=Sept-Nov. (Winter: Year refers to Jan/Feb).\n", + "Item: Values are ranked and displayed to 1 dp. Where values are equal, rankings are based in order of year descending.\n", + "Item: Data are provisional from December 2014 & Winter 2015. Last updated 07/04/2015\n" + ] + } + ], + "source": [ + "# Set the path and read metadata\n", + "fpath = \"../example_data/uk_rainfall.txt\"\n", + "with open(fpath) as f:\n", + " metadata = [f.readline().strip() for i in range(6)]\n", + " \n", + "for item in metadata:\n", + " print(\"Item:\", item)" + ] + }, + { + "cell_type": "markdown", + "id": "3294a316-66fc-4234-a226-a2ba4bf72480", + "metadata": {}, + "source": [ + "## Now let's see what Pandas can do to read the actual tabular data\n", + "\n", + "Pandas can read many formats, and stores data very efficiently. In this case we use:\n", + "\n", + "`pandas.read_csv()`\n", + "\n", + "See docs: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html\n", + "\n", + "In one call, we tell it to:\n", + "- read from file `fpath`\n", + "- skip the first 6 rows of the header (captured above)\n", + "- use a regular expression to split the fields (i.e. 
`\"\\s+\"` which means split on white space)\n", + "- use the first column (Year) as the index\n", + "- values specified as `\"---\"` should be treated as missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3f9a4009-7ae6-450c-8a8c-ceee826d55f5", + "metadata": {}, + "outputs": [], + "source": [ + "# Read it in one line with Pandas!\n", + "\n", + "import pandas as pd\n", + "df = pd.read_csv(fpath, skiprows=6, sep=\"\\s+\",\n", + "\t\t\t index_col=0, na_values=\"---\")" + ] + }, + { + "cell_type": "markdown", + "id": "c1a49fbb-b387-4b6c-86de-95d84149cefb", + "metadata": {}, + "source": [ + "View the data as the DataFrame `df`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "65d6658e-1cf3-452a-be82-b3d16d1fd8f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDECWINSPRSUMAUTANN
Year
1910111.4126.149.995.371.870.297.1140.227.089.4128.4142.2NaN217.0307.5244.81148.9
191159.299.762.169.052.277.043.369.369.491.5141.3188.4301.0183.4189.6302.21022.4
1912111.779.5128.236.158.2124.592.3167.657.1116.2106.9163.7379.6222.5384.4280.31242.0
1913123.457.1131.2102.981.563.833.744.573.7103.0125.986.6344.2315.6142.1302.61027.4
191478.8114.9124.352.359.652.594.480.157.261.8139.3203.3280.3236.3227.0258.31118.6
......................................................
201079.774.879.448.039.038.6107.697.6114.0101.1123.247.5255.4166.4243.8338.3950.5
2011102.8114.549.736.7101.885.176.1105.8108.5122.8100.6168.1264.9188.2266.9331.91172.5
2012110.960.037.0128.065.8149.0118.9111.3112.9126.2135.5179.4339.0230.8379.2374.61334.8
2013110.559.864.663.692.148.765.472.771.6163.491.0187.6349.6220.2186.8326.01091.0
2014188.0169.280.067.899.654.864.7138.822.8158.5123.7129.7544.9247.4258.3304.91297.6
\n", + "

105 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " JAN FEB MAR APR MAY JUN JUL AUG SEP OCT \\\n", + "Year \n", + "1910 111.4 126.1 49.9 95.3 71.8 70.2 97.1 140.2 27.0 89.4 \n", + "1911 59.2 99.7 62.1 69.0 52.2 77.0 43.3 69.3 69.4 91.5 \n", + "1912 111.7 79.5 128.2 36.1 58.2 124.5 92.3 167.6 57.1 116.2 \n", + "1913 123.4 57.1 131.2 102.9 81.5 63.8 33.7 44.5 73.7 103.0 \n", + "1914 78.8 114.9 124.3 52.3 59.6 52.5 94.4 80.1 57.2 61.8 \n", + "... ... ... ... ... ... ... ... ... ... ... \n", + "2010 79.7 74.8 79.4 48.0 39.0 38.6 107.6 97.6 114.0 101.1 \n", + "2011 102.8 114.5 49.7 36.7 101.8 85.1 76.1 105.8 108.5 122.8 \n", + "2012 110.9 60.0 37.0 128.0 65.8 149.0 118.9 111.3 112.9 126.2 \n", + "2013 110.5 59.8 64.6 63.6 92.1 48.7 65.4 72.7 71.6 163.4 \n", + "2014 188.0 169.2 80.0 67.8 99.6 54.8 64.7 138.8 22.8 158.5 \n", + "\n", + " NOV DEC WIN SPR SUM AUT ANN \n", + "Year \n", + "1910 128.4 142.2 NaN 217.0 307.5 244.8 1148.9 \n", + "1911 141.3 188.4 301.0 183.4 189.6 302.2 1022.4 \n", + "1912 106.9 163.7 379.6 222.5 384.4 280.3 1242.0 \n", + "1913 125.9 86.6 344.2 315.6 142.1 302.6 1027.4 \n", + "1914 139.3 203.3 280.3 236.3 227.0 258.3 1118.6 \n", + "... ... ... ... ... ... ... ... 
\n", + "2010 123.2 47.5 255.4 166.4 243.8 338.3 950.5 \n", + "2011 100.6 168.1 264.9 188.2 266.9 331.9 1172.5 \n", + "2012 135.5 179.4 339.0 230.8 379.2 374.6 1334.8 \n", + "2013 91.0 187.6 349.6 220.2 186.8 326.0 1091.0 \n", + "2014 123.7 129.7 544.9 247.4 258.3 304.9 1297.6 \n", + "\n", + "[105 rows x 17 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "be40d4ee-62b9-40f4-b17f-e511d46c1ba8", + "metadata": {}, + "source": [ + "Use some of the built-in Pandas DataFrame functions:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2c32430e-9fdd-4e00-8aef-4bdb18883b14", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['JAN',\n", + " 'FEB',\n", + " 'MAR',\n", + " 'APR',\n", + " 'MAY',\n", + " 'JUN',\n", + " 'JUL',\n", + " 'AUG',\n", + " 'SEP',\n", + " 'OCT',\n", + " 'NOV',\n", + " 'DEC',\n", + " 'WIN',\n", + " 'SPR',\n", + " 'SUM',\n", + " 'AUT',\n", + " 'ANN']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the column names as a list\n", + "df.columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f5f86eb3-1b11-4d0c-888a-07a6d66bd634", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Year'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find out which field is the index\n", + "df.index.name" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "af1f7540-2a52-4db0-bc98-2f7c20f82388", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDECWINSPRSUMAUTANN
count105.000000105.000000105.000000105.000000105.000000105.000000105.000000105.000000105.000000105.000000105.000000105.000000104.000000105.000000105.000000105.000000105.000000
mean113.30857183.38761978.65333368.53523869.38476269.09619082.01238192.47714391.883810113.569524114.699048118.287619314.483654216.570476243.580000320.1533331095.295238
std35.85673737.53393528.33396723.73422524.05508026.22927727.96689633.31835733.72433838.15677038.53817939.01311671.43750842.20788156.87460365.079506108.620457
min29.0000008.80000017.00000014.10000022.00000014.20000030.60000010.30000022.80000031.60000022.10000032.400000119.500000119.800000103.000000193.400000835.400000
25%93.30000058.00000057.20000050.00000051.70000050.40000061.90000073.90000069.40000089.40000086.10000088.600000259.825000189.400000205.700000268.1000001025.600000
50%111.40000079.50000075.50000069.00000066.40000064.90000078.50000090.30000092.000000115.300000112.800000115.500000309.450000212.700000240.800000321.5000001091.000000
75%137.000000106.30000093.50000087.20000086.70000086.90000098.500000112.900000113.700000142.300000143.000000142.200000356.900000237.300000280.100000368.5000001177.700000
max205.100000187.900000150.300000128.000000128.300000149.000000145.600000167.600000182.200000194.800000215.700000213.000000544.900000331.700000384.400000497.8000001337.300000
\n", + "
" + ], + "text/plain": [ + " JAN FEB MAR APR MAY JUN \\\n", + "count 105.000000 105.000000 105.000000 105.000000 105.000000 105.000000 \n", + "mean 113.308571 83.387619 78.653333 68.535238 69.384762 69.096190 \n", + "std 35.856737 37.533935 28.333967 23.734225 24.055080 26.229277 \n", + "min 29.000000 8.800000 17.000000 14.100000 22.000000 14.200000 \n", + "25% 93.300000 58.000000 57.200000 50.000000 51.700000 50.400000 \n", + "50% 111.400000 79.500000 75.500000 69.000000 66.400000 64.900000 \n", + "75% 137.000000 106.300000 93.500000 87.200000 86.700000 86.900000 \n", + "max 205.100000 187.900000 150.300000 128.000000 128.300000 149.000000 \n", + "\n", + " JUL AUG SEP OCT NOV DEC \\\n", + "count 105.000000 105.000000 105.000000 105.000000 105.000000 105.000000 \n", + "mean 82.012381 92.477143 91.883810 113.569524 114.699048 118.287619 \n", + "std 27.966896 33.318357 33.724338 38.156770 38.538179 39.013116 \n", + "min 30.600000 10.300000 22.800000 31.600000 22.100000 32.400000 \n", + "25% 61.900000 73.900000 69.400000 89.400000 86.100000 88.600000 \n", + "50% 78.500000 90.300000 92.000000 115.300000 112.800000 115.500000 \n", + "75% 98.500000 112.900000 113.700000 142.300000 143.000000 142.200000 \n", + "max 145.600000 167.600000 182.200000 194.800000 215.700000 213.000000 \n", + "\n", + " WIN SPR SUM AUT ANN \n", + "count 104.000000 105.000000 105.000000 105.000000 105.000000 \n", + "mean 314.483654 216.570476 243.580000 320.153333 1095.295238 \n", + "std 71.437508 42.207881 56.874603 65.079506 108.620457 \n", + "min 119.500000 119.800000 103.000000 193.400000 835.400000 \n", + "25% 259.825000 189.400000 205.700000 268.100000 1025.600000 \n", + "50% 309.450000 212.700000 240.800000 321.500000 1091.000000 \n", + "75% 356.900000 237.300000 280.100000 368.500000 1177.700000 \n", + "max 544.900000 331.700000 384.400000 497.800000 1337.300000 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show the 
spread of each column\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3848706b-6fef-417b-8b74-73b0615a1744", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 105 entries, 1910 to 2014\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 JAN 105 non-null float64\n", + " 1 FEB 105 non-null float64\n", + " 2 MAR 105 non-null float64\n", + " 3 APR 105 non-null float64\n", + " 4 MAY 105 non-null float64\n", + " 5 JUN 105 non-null float64\n", + " 6 JUL 105 non-null float64\n", + " 7 AUG 105 non-null float64\n", + " 8 SEP 105 non-null float64\n", + " 9 OCT 105 non-null float64\n", + " 10 NOV 105 non-null float64\n", + " 11 DEC 105 non-null float64\n", + " 12 WIN 104 non-null float64\n", + " 13 SPR 105 non-null float64\n", + " 14 SUM 105 non-null float64\n", + " 15 AUT 105 non-null float64\n", + " 16 ANN 105 non-null float64\n", + "dtypes: float64(17)\n", + "memory usage: 14.8 KB\n" + ] + } + ], + "source": [ + "# Get information about the overall DataFrame\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "766af082-3c2a-43a6-9abd-025d10f1fcdf", + "metadata": {}, + "source": [ + "You can use `df.loc` to locate specific columns/rows. In this case we select WINTER in 1918." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0b0446c6-7a03-4aef-9937-cf0dffb8f829", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "272.9" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[1918, \"WIN\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6698a4be-edfd-461f-a98e-59f3cc7a6b02", + "metadata": {}, + "source": [ + "In contrast, `df.iloc` selects by integer position rather than by label. Here is the same lookup using zero-based positions (row 8 is 1918, column 12 is `WIN`):" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2305918e-e01a-4c37-b1ce-ad4861ee7894", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "272.9" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[8, 12]" + ] + }, + { + "cell_type": "markdown", + "id": "3ab2aa8e-45b3-4560-bf68-39f3051f5369", + "metadata": {}, + "source": [ + "And you can slice as well as select individual values:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "898d499d-8832-4f7e-b5f4-6fca4f5eabbf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AUTANN
Year
1910244.81148.9
1911302.21022.4
1912280.31242.0
1913302.61027.4
\n", + "
" + ], + "text/plain": [ + " AUT ANN\n", + "Year \n", + "1910 244.8 1148.9\n", + "1911 302.2 1022.4\n", + "1912 280.3 1242.0\n", + "1913 302.6 1027.4" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[:4, -2:]" + ] + }, + { + "cell_type": "markdown", + "id": "3852d509-3643-48ed-a753-4c1fd92267b9", + "metadata": {}, + "source": [ + "## Plotting\n", + "\n", + "Plotting is easy, just call the `plot()` method on the particular DataFrame or Series you are looking at." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "97109723-37d2-48d7-b383-a5de0956c740", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.MAR.plot(title=\"Time-series of rainfall in March\", ylabel=\"Rainfall (mm)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26f05680-2ed6-4736-8b62-9c624db22dd4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 + Jaspy", + "language": "python", + "name": "jaspy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}