빅데이터분석기사 실기 시험을 준비하면서, NumPy와 Pandas에서 자주 쓰는 문법들을 요약해 외우고자 cheatsheet처럼 정리해보았습니다. 문법과 그에 따른 결과를 최대한 한 줄로 보기 쉽게 만들었습니다.
NumPy
import numpy as np
# Build a 2-D array from nested lists, then inspect its shape and rank.
a1 = np.array([[1, 2, 3], [4, 5, 6]])
print(a1)  # [[1 2 3] [4 5 6]]
print(a1.shape)  # (2, 3)
print(a1.ndim)  # 2

## Array
a2 = np.arange(1, 20, 3)  # [1 4 7 10 13 16 19]
a3 = np.zeros((2, 3))  # [[0. 0. 0.] [0. 0. 0.]]
a4 = np.ones((3, 2))  # [[1. 1.] [1. 1.] [1. 1.]]
a5 = np.zeros_like(a4)  # [[0. 0.] [0. 0.] [0. 0.]]
a6 = np.full((2, 3), 9)  # [[9 9 9] [9 9 9]]
# Values differ on every run; the output below is only a sample.
a7 = np.random.random((3, 2))  # e.g. [[0.8 0.2] [0.7 0.5] [0.7 0.1]]
# np.eye(4) is the 4x4 identity matrix (the result is 4x4, not 3x3).
a8 = np.eye(4)  # [[1. 0. 0. 0.] [0. 1. 0. 0.] [0. 0. 1. 0.] [0. 0. 0. 1.]]
## Shape
a9 = np.arange(6).reshape(2, 3)  # [[0 1 2] [3 4 5]]
# -1 lets NumPy infer that dimension: 6 elements / 2 columns -> 3 rows.
a10 = np.arange(6).reshape(-1, 2)  # [[0 1] [2 3] [4 5]]
# (2, 3) x (3, 2) matrix product -> (2, 2).
a11 = np.dot(a9, a10)  # [[10 13] [28 40]]
a12 = a11.transpose()  # [[10 28] [13 40]]
a13 = a11.flatten()  # [10 13 28 40]

## Stats
# The variance/std shown here divide by N (582.75 / 4 = 145.6875).
print(len(a13), np.mean(a13), np.var(a13), np.std(a13))
# 4 22.75 145.6875 12.07010770457331
print(np.max(a13), np.min(a13), np.median(a13), np.percentile(a13, 75))
# 40 10 20.5 31.0
Pandas
import pandas as pd
# A Series built from a plain list; its index and raw values are printed below.
s = pd.Series(data=[1, 3, 5, 6])
print(s.index)  # RangeIndex(start=0, stop=4, step=1)
print(s.values)  # [1 3 5 6]
## DataFrame
# Two constructions of the same 3x2 frame: df1 from row-wise nested lists,
# df2 from a mapping of column name -> column values.
df1 = pd.DataFrame(
    data=[[1, 2], [3, 4], [5, 6]],
    index=["r1", "r2", "r3"],
    columns=["c1", "c2"],
)
df2 = pd.DataFrame(
    data={"c1": [1, 3, 5], "c2": [2, 4, 6]},
    index=["r1", "r2", "r3"],
)
print(df1)
print(df2)
# Each print shows the same table:
#     c1  c2
# r1   1   2
# r2   3   4
# r3   5   6
## CSV
# Load file_name.csv (passing header=None), then write the frame to
# file_name.tsv as tab-separated text with index=False.
df3 = pd.read_csv("file_name.csv", header=None)
df3.to_csv("file_name.tsv", sep="\t", index=False)
## Summary
print(df1.shape)  # (3, 2)
print(df1.head(2))  # first 2 rows
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
df1.info()  # columns, non-null counts, data types, memory usage
print(df1.describe())  # count, mean, std, min, max, percentiles
print(df1["c2"].value_counts())  # {2: 1, 4: 1, 6: 1}
## Stats
# Column-wise reductions; the trailing comments show each result in
# {column: value} notation.
print(df1.sum())  # {"c1": 9, "c2": 12}
print(df1.count())  # {"c1": 3, "c2": 3}
print(df1.min())  # {"c1": 1, "c2": 2}
print(df1.max())  # {"c1": 5, "c2": 6}
print(df1.mean())  # {"c1": 3.0, "c2": 4.0}
# For c1 = [1, 3, 5] the variance shown is (4 + 0 + 4) / 2, i.e. divided by N - 1.
print(df1.var())  # {"c1": 4.0, "c2": 4.0}
print(df1.std())  # {"c1": 2.0, "c2": 2.0}
print(df1.quantile(q=0.25))  # {"c1": 2.0, "c2": 3.0}
print(df1.corr())
# Correlation matrix:
#      c1   c2
# c1  1.0  1.0
# c2  1.0  1.0
## GroupBy
df4 = pd.DataFrame(
    data={"c1": ["a", "a", "b"], "c2": [1, 2, 3], "c3": [4, 5, 6]},
    index=["r1", "r2", "r3"],
)
# Average the remaining columns within each distinct value of c1.
print(df4.groupby(by="c1").mean())
# Result:
#      c2   c3
# c1
# a   1.5  4.5
# b   3.0  6.0
## Drop
# drop() returns a new frame here; df4 itself is not reassigned.
print(df4.drop(index=["r1", "r3"]))  # remove rows by label
print(df4.drop(columns=["c1", "c3"]))  # remove columns by label
# First print:
#    c1  c2  c3
# r2  a   2   5
# Second print:
#     c2
# r1   1
# r2   2
# r3   3
## Indexing
print(df1.iloc[1, 1])  # 4  (row/column by integer position)
print(df1.loc["r2", "c2"])  # 4  (row/column by label)
# Boolean masks keep only the rows where the condition holds.
print(df1.loc[df1["c2"] == 4])
print(df1.loc[(df1["c2"] > 2) & (df1["c1"] < 4)])
print(df1.sort_values("c2", ascending=False))
# Both mask selections print:
#     c1  c2
# r2   3   4
# Sorted by c2, descending:
#     c1  c2
# r3   5   6
# r2   3   4
# r1   1   2
## NaN
# Work on a copy: assigning the column to df1 itself would leave an all-NaN
# c3 column on df1 for every later statement that uses df1.
df5 = df1.copy()
df5["c3"] = np.nan
print(df5.isna())
# Every NaN is replaced with the mean of c2 (4.0).
print(df5.fillna(df5["c2"].mean()))
# isna():
#        c1     c2    c3
# r1  False  False  True
# r2  False  False  True
# r3  False  False  True
# fillna():
#     c1  c2   c3
# r1   1   2  4.0
# r2   3   4  4.0
# r3   5   6  4.0
## Concat
# NOTE(review): the outputs below assume df1 and df2 each hold only the
# columns c1 and c2 at this point - confirm against earlier statements.
print(pd.concat([df1, df2], axis="index"))  # stack rows; labels r1..r3 repeat
print(pd.concat([df1, df2], axis="columns"))  # side by side; c1/c2 repeat
# Row-wise:
#     c1  c2
# r1   1   2
# r2   3   4
# r3   5   6
# r1   1   2
# r2   3   4
# r3   5   6
# Column-wise:
#     c1  c2  c1  c2
# r1   1   2   1   2
# r2   3   4   3   4
# r3   5   6   5   6
'개발 > 파이썬' 카테고리의 다른 글
일급객체란? (0) | 2022.09.13 |
---|---|
Python logging & logger 모듈 (0) | 2022.09.13 |
Pyreverse, Python 다이어그램 diagram 패키지 (0) | 2022.09.13 |
Radon, Python code metrics 평가 도구 (0) | 2022.09.13 |
댓글