Site icon Afzal Badshah, PhD

Data Manipulation with MongoDB Aggregation Framework in Python

The MongoDB Aggregation Framework is a powerful tool that allows for data manipulation and analysis within MongoDB collections. It provides a flexible and efficient way to process and transform data, enabling users to perform complex operations such as grouping, sorting, filtering, and computing aggregate values. In this lab tutorial, we will introduce the concepts of the MongoDB Aggregation Framework, provide a detailed explanation of the code, and walk through each line to understand its functionality. Visit the detailed tutorial here.

Code

import pymongo
from pymongo import MongoClient

# Connect to MongoDB (replace the placeholder URI with your own connection
# details; do not commit real credentials to source control)
client = pymongo.MongoClient("mongodb+srv://user:pass@cluster0.ergtejf.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")

# Select the database and the collection that every step below operates on
db = client.afzal
collection = db.test

# Insert sample data into the collection.
# Use the `collection` handle bound above: `db.collection` would instead
# address a separate collection literally named "collection".
# NOTE: every run inserts these documents again, so re-running the script
# grows the collection and changes the results printed below.
collection.insert_many([
    { 'name': 'Afzal', 'age': 25, 'city': 'Islamabad' },
    { 'name': 'Jalal', 'age': 28, 'city': 'Mianwali' },
    { 'name': 'Yousaf', 'age': 30, 'city': 'Quetta' },
    { 'name': 'Ibrahim', 'age': 35, 'city': 'Karachi' },
])

print("Total documents in collection:", collection.count_documents({}))

# Calculate average age ('_id': None puts every document into a single group)
pipeline_avg_age = [{ '$group': { '_id': None, 'avgAge': { '$avg': '$age' } } }]
avg_age_result = list(collection.aggregate(pipeline_avg_age))
print("Average age:", avg_age_result[0]['avgAge'])

# Group by city and count ('$sum': 1 adds one per document in each group)
pipeline_city_count = [{ '$group': { '_id': '$city', 'count': { '$sum': 1 } } }]
city_count_result = list(collection.aggregate(pipeline_city_count))
print("City count:", city_count_result)

# Group by city and find max age
pipeline_max_age = [{ '$group': { '_id': '$city', 'maxAge': { '$max': '$age' } } }]
max_age_result = list(collection.aggregate(pipeline_max_age))
print("Max age by city:", max_age_result)

# Filter documents where age is greater than 25
pipeline_filtered = [{ '$match': { 'age': { '$gt': 25 } } }]
filtered_result = list(collection.aggregate(pipeline_filtered))
print("Filtered documents:", filtered_result)

# Sort documents by age in descending order (-1 = descending)
pipeline_sorted = [{ '$sort': { 'age': -1 } }]
sorted_result = list(collection.aggregate(pipeline_sorted))
print("Sorted documents:", sorted_result)

# Release the connection now that all queries have run
client.close()

Connection and Database Selection

import pymongo
from pymongo import MongoClient

# Open a client for the MongoDB deployment (replace with your connection details)
client = pymongo.MongoClient(
    host="mongodb+srv://user:pass@cluster0.ergtejf.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)

# Pick the database, then the collection inside it
# (replace with your database and collection names)
db = client["afzal"]
collection = db["test"]

Sample Data and Counting Documents

# Insert sample data into the collection.
# `collection` is the handle bound with `collection = db.test`; writing
# `db.collection` would address a collection literally named "collection".
collection.insert_many([
    { 'name': 'Afzal', 'age': 25, 'city': 'Islamabad' },
    { 'name': 'Jalal', 'age': 28, 'city': 'Mianwali' },
    { 'name': 'Yousaf', 'age': 30, 'city': 'Quetta' },
    { 'name': 'Ibrahim', 'age': 35, 'city': 'Karachi' },
])

# An empty filter ({}) matches every document in the collection
print("Total documents in collection:", collection.count_documents({}))

Calculating Average Age

# Calculate average age.
# '_id': None puts every document into a single group, so the result list
# holds one document whose 'avgAge' field is the average of all 'age' values.
# `collection` is the handle bound with `collection = db.test`.
pipeline_avg_age = [{ '$group': { '_id': None, 'avgAge': { '$avg': '$age' } } }]
avg_age_result = list(collection.aggregate(pipeline_avg_age))
print("Average age:", avg_age_result[0]['avgAge'])

Grouping by City and Counting Documents

# Group by city and count.
# '_id': '$city' creates one group per distinct city; '$sum': 1 adds one for
# each document in the group, giving the number of documents per city.
# `collection` is the handle bound with `collection = db.test`.
pipeline_city_count = [{ '$group': { '_id': '$city', 'count': { '$sum': 1 } } }]
city_count_result = list(collection.aggregate(pipeline_city_count))
print("City count:", city_count_result)

Grouping by City and Finding Maximum Age

# Group by city and find max age.
# '$max' keeps the largest 'age' value seen within each city group.
# `collection` is the handle bound with `collection = db.test`.
pipeline_max_age = [{ '$group': { '_id': '$city', 'maxAge': { '$max': '$age' } } }]
max_age_result = list(collection.aggregate(pipeline_max_age))
print("Max age by city:", max_age_result)

Filtering Documents

# Filter documents where age is greater than 25.
# '$match' passes through only the documents satisfying the condition;
# '$gt' is a strict comparison, so a document with age exactly 25 is excluded.
# `collection` is the handle bound with `collection = db.test`.
pipeline_filtered = [{ '$match': { 'age': { '$gt': 25 } } }]
filtered_result = list(collection.aggregate(pipeline_filtered))
print("Filtered documents:", filtered_result)

Sorting Documents

# Sort documents by age in descending order (-1 = descending, 1 = ascending).
# `collection` is the handle bound with `collection = db.test`.
pipeline_sorted = [{ '$sort': { 'age': -1 } }]
sorted_result = list(collection.aggregate(pipeline_sorted))
print("Sorted documents:", sorted_result)

Exit mobile version