1.2. Pandas and Polars
import pandas as pd
import sklearn
import warnings
# Silence every FutureWarning for the rest of the session; note that this
# also hides deprecation notices emitted by the libraries used below.
warnings.simplefilter("ignore", FutureWarning)
from sklearn import set_config
# Set scikit-learn's global `transform_output` option to "pandas" so that
# transformers hand back DataFrames.
set_config(transform_output="pandas")
Pandas
Creating DataFrames with Pandas
List to DataFrame
# A flat list becomes a one-column DataFrame; the column label is passed
# explicitly because a bare list carries no name.
data = [10, 20, 30, 40, 50, 60]
df = pd.DataFrame(data=data, columns=["Numbers"])

# A list of [name, age] pairs: every inner list is one row.
data = [
    ["tom", 10],
    ["nick", 15],
    ["juli", 14],
]
df = pd.DataFrame(data=data, columns=["Name", "Age"])

# Leave the frame as the last expression so that it is displayed.
df
| | Name | Age |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 14 |
Dictionary to DataFrame
# Column-oriented input: each key is a column label and each list holds
# that column's values.
data = dict(
    Name=["Tom", "nick", "krish", "jack"],
    Age=[20, 21, 19, 18],
)
df = pd.DataFrame(data=data)

# Display the result.
df
| | Name | Age |
---|---|---|
0 | Tom | 20 |
1 | nick | 21 |
2 | krish | 19 |
3 | jack | 18 |
# Same column-oriented input as before, but this time the rows get
# explicit labels instead of the default 0..n-1 positions.
data = dict(
    Name=["Tom", "Jack", "nick", "juli"],
    marks=[99, 98, 95, 90],
)
row_labels = [f"row{position}" for position in range(1, 5)]
df = pd.DataFrame(data=data, index=row_labels)

# Display the result.
df
| | Name | marks |
---|---|---|
row1 | Tom | 99 |
row2 | Jack | 98 |
row3 | nick | 95 |
row4 | juli | 90 |
List of Dictionaries (Rows) to DataFrame
# Row-oriented input: each dictionary is one row, keyed by column label.
data = [
    dict(a=1, b=2, c=3),
    dict(a=10, b=20, c=30),
]
df = pd.DataFrame(data=data)

# Display the result.
df
| | a | b | c |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 10 | 20 | 30 |
Using zip()
# Two parallel lists: the names and the matching ages.
Name = ["tom", "krish", "nick", "juli"]
Age = [25, 30, 26, 22]

# zip() pairs the lists element-wise; materialise the pairs as a list so
# they can be printed and then handed to pandas.
list_of_tuples = [*zip(Name, Age)]
print(list_of_tuples)

# Each (name, age) tuple becomes one row of the DataFrame.
df = pd.DataFrame(data=list_of_tuples, columns=["Name", "Age"])

# Display the result.
df
[('tom', 25), ('krish', 30), ('nick', 26), ('juli', 22)]
| | Name | Age |
---|---|---|
0 | tom | 25 |
1 | krish | 30 |
2 | nick | 26 |
3 | juli | 22 |
Using Series
# An unnamed Series of four integers.
d = pd.Series([10, 20, 30, 40])

# Promote the Series to a one-column DataFrame, naming the column on the way.
df = d.to_frame(name="Numbers")

# Display the result.
df
| | Numbers |
---|---|
0 | 10 |
1 | 20 |
2 | 30 |
3 | 40 |
Performing look-ups with Pandas
# Rating grades A..G paired with the default rate of each grade.
ratings = list("ABCDEFG")
default_rates = [0.0003, 0.022, 0.0543, 0.0828, 0.0927, 0.1105, 0.16]
df = pd.DataFrame({"Rating": ratings, "DefaultRate": default_rates})
# Ratings that have a dedicated quality label. Built once at import time
# rather than on every call, because the function is applied row by row.
_RATING_TO_QUALITY = {
    "A": "Prime",
    "B": "Good",
    "C": "Fair",
    "D": "Subprime",
}


def map_rating_to_quality(rating: str) -> str:
    """Return the credit-quality label for a rating grade.

    Args:
        rating: Rating grade to look up, e.g. ``"A"``.

    Returns:
        ``"Prime"``, ``"Good"``, ``"Fair"`` or ``"Subprime"`` for the grades
        ``"A"`` to ``"D"``; ``"High-risk"`` for any other value.
    """
    return _RATING_TO_QUALITY.get(rating, "High-risk")
# Look up the quality label of every rating, then attach the labels to the
# frame as a new "Quality" column.
quality_labels = df["Rating"].apply(map_rating_to_quality)
df["Quality"] = quality_labels

# Display the result.
df
| | Rating | DefaultRate | Quality |
---|---|---|---|
0 | A | 0.0003 | Prime |
1 | B | 0.0220 | Good |
2 | C | 0.0543 | Fair |
3 | D | 0.0828 | Subprime |
4 | E | 0.0927 | High-risk |
5 | F | 0.1105 | High-risk |
6 | G | 0.1600 | High-risk |
# Element-wise transformation: replace every cell by the length of its
# string representation.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; prefer the new name and fall back on older pandas.
elementwise = df.map if hasattr(df, "map") else df.applymap
elementwise(lambda cell: len(str(cell)))
| | Rating | DefaultRate | Quality |
---|---|---|---|
0 | 1 | 6 | 5 |
1 | 1 | 5 | 4 |
2 | 1 | 6 | 4 |
3 | 1 | 6 | 8 |
4 | 1 | 6 | 9 |
5 | 1 | 6 | 9 |
6 | 1 | 4 | 9 |
Grouping
import random
from sklearn.datasets import make_classification
# Generate a synthetic binary target with class weights of 95% / 5%.
n_samples = 1000
n_classes = 2
random_state = 42
X, y = make_classification(
    n_samples=n_samples,
    n_classes=n_classes,
    weights=[0.95, 0.05],
    random_state=random_state,
)

# Draw the loan purposes from a generator seeded with the same random_state.
# Using the unseeded module-level `random` here would make the purpose
# column (and every group statistic derived from it) change on each run,
# even though the target itself is seeded.
rng = random.Random(random_state)
purposes = ["Credit Card", "Consumer Loan", "BNPL"]

# Create a DataFrame with one row per loan.
data = {
    "purpose": [rng.choice(purposes) for _ in range(n_samples)],
    "default": y,
}
df = pd.DataFrame(data)

# View the top 5 rows.
df.head(n=5)
| | purpose | default |
---|---|---|
0 | BNPL | 0 |
1 | BNPL | 0 |
2 | Credit Card | 0 |
3 | Credit Card | 0 |
4 | Consumer Loan | 0 |
# Per-purpose summary: how many loans there are and the mean of the
# "default" flag within each purpose.
by_purpose = df.groupby("purpose")
by_purpose.agg(
    n_of_loans=("default", "count"),
    default_rate=("default", "mean"),
).reset_index()
| | purpose | n_of_loans | default_rate |
---|---|---|---|
0 | BNPL | 340 | 0.047059 |
1 | Consumer Loan | 334 | 0.050898 |
2 | Credit Card | 326 | 0.061350 |
Polars
import polars as pl
# A small two-column Polars frame: A holds 1..5 and B holds 6..10.
polars_data = pl.DataFrame(
    {
        "A": list(range(1, 6)),
        "B": list(range(6, 11)),
    }
)

# Display the result.
polars_data
shape: (5, 2)
A | B |
---|---|
i64 | i64 |
1 | 6 |
2 | 7 |
3 | 8 |
4 | 9 |
5 | 10 |
from sklearn.datasets import load_iris
# Iris measurements, from Fisher (1936), "The use of multiple measurements
# in taxonomic problems".
iris = load_iris()
feature_names = iris.feature_names

# The same data loaded once as a pandas and once as a Polars DataFrame.
iris_pandas = pd.DataFrame(data=iris.data, columns=feature_names)
iris_polars = pl.DataFrame(iris.data, schema=feature_names)

# Combine two column predicates with `&` and show the first three matches.
short_petal = pl.col("petal length (cm)") < 5
long_sepal = pl.col("sepal length (cm)") > 3
iris_polars.filter(short_petal & long_sepal)[:3]
shape: (3, 4)
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
---|---|---|---|
f64 | f64 | f64 | f64 |
5.1 | 3.5 | 1.4 | 0.2 |
4.9 | 3.0 | 1.4 | 0.2 |
4.7 | 3.2 | 1.3 | 0.2 |