### Pandas and Polars

In [2]:
import pandas as pd
import sklearn

In [3]:
import warnings

warnings.simplefilter("ignore", FutureWarning)

In [4]:
from sklearn import set_config

# Global scikit-learn setting: request pandas containers as transformer output.
# NOTE(review): no scikit-learn transformer is used in the cells visible here —
# confirm this setting is still needed.
set_config(transform_output="pandas")

<img src="https://pandas.pydata.org/static/img/pandas.svg" alt="pandas logo" width="250">

<h4>Pandas</h4>

<h5>Creating DataFrames with Pandas</h5>

**List to DataFrame**

In [5]:
# a flat list becomes a single column; the column name is given explicitly
data = [10, 20, 30, 40, 50, 60]
df = pd.DataFrame({"Numbers": data})

In [6]:
# each inner list is one row: [name, age]
data = [
    ["tom", 10],
    ["nick", 15],
    ["juli", 14],
]

# the column names are supplied separately from the row values
df = pd.DataFrame(data=data, columns=["Name", "Age"])

# display the result
df

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


**Dictionary to DataFrame**

In [7]:
# dictionary of equal-length lists: each key becomes a column
data = dict(
    Name=["Tom", "nick", "krish", "jack"],
    Age=[20, 21, 19, 18],
)

# build the DataFrame straight from the dictionary
df = pd.DataFrame(data=data)

# display the result
df

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [8]:
# dictionary of equal-length lists: each key becomes a column
data = dict(
    Name=["Tom", "Jack", "nick", "juli"],
    marks=[99, 98, 95, 90],
)

# custom row labels "row1" .. "row4" replace the default 0..n-1 index
row_labels = [f"row{position}" for position in range(1, 5)]
df = pd.DataFrame(data=data, index=row_labels)

# display the result
df

Unnamed: 0,Name,marks
row1,Tom,99
row2,Jack,98
row3,nick,95
row4,juli,90


**Rows to DataFrame**

In [9]:
# list of dictionaries: each dictionary is one row, keyed by column name
data = [
    dict(a=1, b=2, c=3),
    dict(a=10, b=20, c=30),
]

# build the DataFrame from the row records
df = pd.DataFrame(data=data)

# display the result
df

Unnamed: 0,a,b,c
0,1,2,3
1,10,20,30


**Using `zip()`**

In [10]:
# two parallel lists: names and the matching ages
Name = ["tom", "krish", "nick", "juli"]
Age = [25, 30, 26, 22]

# pair the i-th name with the i-th age
list_of_tuples = [(person, years) for person, years in zip(Name, Age)]

# show the intermediate (name, age) pairs
print(list_of_tuples)

# each tuple becomes one row of the DataFrame
df = pd.DataFrame(data=list_of_tuples, columns=["Name", "Age"])

# display the result
df

[('tom', 25), ('krish', 30), ('nick', 26), ('juli', 22)]


Unnamed: 0,Name,Age
0,tom,25
1,krish,30
2,nick,26
3,juli,22


**Using Series**

In [11]:
# a one-dimensional Series of values
d = pd.Series([10, 20, 30, 40])

# wrap the Series in a dict so the column is named explicitly,
# independent of whatever name the Series itself carries
df = pd.DataFrame({"Numbers": d})

# display the result
df

Unnamed: 0,Numbers
0,10
1,20
2,30
3,40


<h5>Performing look-ups with Pandas</h5>

In [12]:
# one row per rating grade "A".."G" with its default rate
df = pd.DataFrame(
    data=dict(
        Rating=list("ABCDEFG"),
        DefaultRate=[0.0003, 0.022, 0.0543, 0.0828, 0.0927, 0.1105, 0.16],
    )
)

In [13]:
def map_rating_to_quality(rating):
    """Translate a rating grade into a quality label.

    Grades "A", "B", "C" and "D" map to "Prime", "Good", "Fair" and
    "Subprime" respectively; any other value maps to "High-risk".
    """
    quality_by_rating = dict(A="Prime", B="Good", C="Fair", D="Subprime")
    try:
        return quality_by_rating[rating]
    except KeyError:
        # grade not listed above
        return "High-risk"


# look up the quality label for every rating, then store it as a new column
quality_labels = df["Rating"].apply(map_rating_to_quality)
df["Quality"] = quality_labels
df

Unnamed: 0,Rating,DefaultRate,Quality
0,A,0.0003,Prime
1,B,0.022,Good
2,C,0.0543,Fair
3,D,0.0828,Subprime
4,E,0.0927,High-risk
5,F,0.1105,High-risk
6,G,0.16,High-risk


In [14]:
# Apply a function to every cell: here, the length of each value's string form.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use the new name when it exists and fall back only on older pandas.
cellwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
cellwise(lambda value: len(str(value)))

Unnamed: 0,Rating,DefaultRate,Quality
0,1,6,5
1,1,5,4
2,1,6,4
3,1,6,8
4,1,6,9
5,1,6,9
6,1,4,9


<h5>Grouping</h5>

In [20]:
import random

from sklearn.datasets import make_classification

# Generate synthetic data
n_samples = 1000
n_classes = 2
random_state = 42

# Binary target with class weights 0.95 / 0.05, i.e. defaults are the rare class.
X, y = make_classification(
    n_samples=n_samples,
    n_classes=n_classes,
    weights=[0.95, 0.05],
    random_state=random_state,
)

# Draw the loan purposes from a dedicated generator seeded with `random_state`
# (rather than the unseeded module-level `random.choice`) so the frame, and
# every statistic computed from it, is identical on each Restart & Run All.
purpose_rng = random.Random(random_state)
loan_purposes = ["Credit Card", "Consumer Loan", "BNPL"]

# Create a DataFrame
data = {
    "purpose": [purpose_rng.choice(loan_purposes) for _ in range(n_samples)],
    "default": y,
}

df = pd.DataFrame(data)
# view top 5 rows
df.head(n=5)

Unnamed: 0,purpose,default
0,BNPL,0
1,BNPL,0
2,Credit Card,0
3,Credit Card,0
4,Consumer Loan,0


In [21]:
# Per-purpose summary: how many loans there are and the mean of the 0/1
# `default` flag (the default rate).
loans_by_purpose = df.groupby("purpose")
purpose_summary = loans_by_purpose.agg(
    n_of_loans=("default", "count"),
    default_rate=("default", "mean"),
)
purpose_summary.reset_index()

Unnamed: 0,purpose,n_of_loans,default_rate
0,BNPL,340,0.047059
1,Consumer Loan,334,0.050898
2,Credit Card,326,0.06135


<img src="https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars_github_logo_rect_dark_name.svg" alt="Polars logo" width="250">

<h4>Polars</h4>

In [22]:
import polars as pl

In [23]:
# two integer columns: A holds 1..5 and B holds 6..10
polars_data = pl.DataFrame(
    {
        "A": list(range(1, 6)),
        "B": list(range(6, 11)),
    }
)

In [24]:
# Display the frame; the Polars repr lists each column's dtype beneath its name.
polars_data

A,B
i64,i64
1,6
2,7
3,8
4,9
5,10


In [25]:
from sklearn.datasets import load_iris

# Load the Iris data once and expose the same feature matrix through both libraries.
iris = load_iris()
feature_matrix, feature_names = iris.data, iris.feature_names

# pandas takes the column labels via `columns`, Polars via `schema`
iris_pandas = pd.DataFrame(data=feature_matrix, columns=feature_names)
iris_polars = pl.DataFrame(data=feature_matrix, schema=feature_names)

In [26]:
# The Iris measurements come from Fisher (1936),
# "The use of multiple measurements in taxonomic problems".
# Keep rows with petal length < 5 cm and sepal length > 3 cm,
# then show the first three matches.
short_petal = pl.col("petal length (cm)") < 5
long_sepal = pl.col("sepal length (cm)") > 3
iris_polars.filter(short_petal & long_sepal)[:3]

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
5.1,3.5,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,1.3,0.2
