1.2. Pandas and Polars

import pandas as pd
import sklearn
import warnings

warnings.simplefilter("ignore", FutureWarning)
from sklearn import set_config

set_config(transform_output="pandas")

Pandas

Creating DataFrames with Pandas

List to DataFrame

# A flat list of values becomes a one-column frame labelled "Numbers".
data = [10, 20, 30, 40, 50, 60]
df = pd.DataFrame({"Numbers": data})

# A list of [name, age] rows: every inner list is one row of the frame.
data = [
    ["tom", 10],
    ["nick", 15],
    ["juli", 14],
]

# Column labels are supplied separately because the rows carry no names.
df = pd.DataFrame(data=data, columns=["Name", "Age"])

# display the frame
df
Name Age
0 tom 10
1 nick 15
2 juli 14

Dictionary to DataFrame

# Column-oriented input: each key is a column label, each list that column's values.
data = {
    "Name": ["Tom", "nick", "krish", "jack"],
    "Age": [20, 21, 19, 18],
}

# Build the frame straight from the dictionary.
df = pd.DataFrame(data=data)

# display the frame
df
Name Age
0 Tom 20
1 nick 21
2 krish 19
3 jack 18
# Column-oriented input, as before.
data = {
    "Name": ["Tom", "Jack", "nick", "juli"],
    "marks": [99, 98, 95, 90],
}

# Custom row labels ("row1" .. "row4") replace the default integer index.
row_labels = [f"row{position}" for position in range(1, 5)]
df = pd.DataFrame(data=data, index=row_labels)

# display the frame
df
Name marks
row1 Tom 99
row2 Jack 98
row3 nick 95
row4 juli 90

Rows to DataFrame

# Row-oriented input: one dictionary per row, keyed by column label.
data = [
    {"a": 1, "b": 2, "c": 3},
    {"a": 10, "b": 20, "c": 30},
]

# The dictionary keys become the columns of the frame.
df = pd.DataFrame(data=data)

# display the frame
df
a b c
0 1 2 3
1 10 20 30

Using zip()

# Two parallel lists: the i-th name belongs to the i-th age.
Name = ["tom", "krish", "nick", "juli"]
Age = [25, 30, 26, 22]

# zip() pairs the lists element by element; unpack it into a list of
# (name, age) tuples.
list_of_tuples = [*zip(Name, Age)]

# show the paired records
print(list_of_tuples)

# Each tuple becomes one row; the column labels are given explicitly.
df = pd.DataFrame(data=list_of_tuples, columns=["Name", "Age"])

# display the frame
df
[('tom', 25), ('krish', 30), ('nick', 26), ('juli', 22)]
Name Age
0 tom 25
1 krish 30
2 nick 26
3 juli 22

Using Series

# An unnamed Series holding the values.
d = pd.Series([10, 20, 30, 40])

# Wrap the Series in a frame, labelling its single column "Numbers".
df = pd.DataFrame({"Numbers": d})

# display the frame
df
Numbers
0 10
1 20
2 30
3 40
Performing look-ups with Pandas
# Rating grades "A".."G" and the default rate listed for each grade.
ratings = list("ABCDEFG")
default_rates = [0.0003, 0.022, 0.0543, 0.0828, 0.0927, 0.1105, 0.16]

df = pd.DataFrame({"Rating": ratings, "DefaultRate": default_rates})
def map_rating_to_quality(rating):
    """Return the quality label for a rating grade.

    Grades "A" to "D" map to "Prime", "Good", "Fair" and "Subprime";
    any other value falls back to "High-risk".
    """
    quality_by_rating = dict(A="Prime", B="Good", C="Fair", D="Subprime")
    if rating in quality_by_rating:
        return quality_by_rating[rating]
    return "High-risk"


# Look up the quality label for every rating, then store it as a new column.
quality_labels = df["Rating"].apply(map_rating_to_quality)
df["Quality"] = quality_labels
df
Rating DefaultRate Quality
0 A 0.0003 Prime
1 B 0.0220 Good
2 C 0.0543 Fair
3 D 0.0828 Subprime
4 E 0.0927 High-risk
5 F 0.1105 High-risk
6 G 0.1600 High-risk
# Element-wise: length of each cell's string representation.
# DataFrame.map replaces DataFrame.applymap, which is deprecated since pandas 2.1.
df.map(lambda cell: len(str(cell)))
Rating DefaultRate Quality
0 1 6 5
1 1 5 4
2 1 6 4
3 1 6 8
4 1 6 9
5 1 6 9
6 1 4 9
Grouping
import random

from sklearn.datasets import make_classification

# Generate synthetic data
n_samples = 1000
n_classes = 2
random_state = 42

# Imbalanced binary target: class weights of 0.95 / 0.05.
X, y = make_classification(
    n_samples=n_samples,
    n_classes=n_classes,
    weights=[0.95, 0.05],
    random_state=random_state,
)

# Draw the loan purposes from a dedicated generator seeded with the same
# random_state, so the "purpose" column (and every aggregate computed from
# it) is identical on each run instead of depending on the global RNG state.
rng = random.Random(random_state)
purposes = ["Credit Card", "Consumer Loan", "BNPL"]

# Create a DataFrame
data = {
    "purpose": [rng.choice(purposes) for _ in range(n_samples)],
    "default": y,
}

df = pd.DataFrame(data)
# view top 5 rows
df.head(n=5)
purpose default
0 BNPL 0
1 BNPL 0
2 Credit Card 0
3 Credit Card 0
4 Consumer Loan 0
# Per purpose: number of loans and the mean of the 0/1 default flag
# (i.e. the default rate).
loans_by_purpose = df.groupby("purpose")
purpose_summary = loans_by_purpose.agg(
    n_of_loans=("default", "count"),
    default_rate=("default", "mean"),
)

# Turn the "purpose" index back into an ordinary column for display.
purpose_summary.reset_index()
purpose n_of_loans default_rate
0 BNPL 340 0.047059
1 Consumer Loan 334 0.050898
2 Credit Card 326 0.061350

Polars

import polars as pl
# Two integer columns: A holds 1..5, B holds 6..10.
polars_data = pl.DataFrame(
    {
        "A": list(range(1, 6)),
        "B": list(range(6, 11)),
    }
)
polars_data
shape: (5, 2)
A B
i64 i64
1 6
2 7
3 8
4 9
5 10
from sklearn.datasets import load_iris

# Fisher's iris data set, loaded via scikit-learn.
iris = load_iris()

# The same measurements as a Pandas DataFrame ...
iris_pandas = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# ... and as a Polars DataFrame.
iris_polars = pl.DataFrame(iris.data, schema=iris.feature_names)

# Keep rows with petal length below 5 and sepal length above 3,
# then show the first three matches.
short_petal = pl.col("petal length (cm)") < 5
long_sepal = pl.col("sepal length (cm)") > 3
iris_polars.filter(short_petal & long_sepal)[0:3]
shape: (3, 4)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
f64 f64 f64 f64
5.1 3.5 1.4 0.2
4.9 3.0 1.4 0.2
4.7 3.2 1.3 0.2