Vous avez reçu un message "Your GitLab account has been locked ..." ? Pas d'inquiétude : lisez cet article https://docs.gricad-pages.univ-grenoble-alpes.fr/help/unlock/

Commit 9a52d42c authored by ramdsc's avatar ramdsc Committed by Millian Poquet
Browse files

Simpler example, initial documentation, modularized script

parent 5580eebf
# Heterogeneous platforms
## Principle
Batsim allows for the simulation of homogeneous systems with ease. However, one might need to model different kinds of nodes / processors inside a data centre or a cloud environment: distinct memory availability, processor-type-based scheduling, ...
Even though SimGrid does not support this, it provides user-defined properties which bring flexibility to the simulation.
## Design
Fundamental architecture approach:
![Architecture model](./doc/architecture_model.png)
- **Platform**: root element for SimGrid platform descriptions.
- **Main zone**: top-level zone with *Full* routing; interconnects the master host with the set of clusters.
- **Master zone**: contains and isolates the master host.
- **Master host**: processes communication with the decision system and job submissions.
- **Config zone**: holds user-defined properties associated with node and processor types.
- **Clusters**: groups of heterogeneous nodes in the data centre.
- **Router**: provides connectivity with the master host and thus job allocation.
- **Backbone**: intra-cluster communications.
- **Nodes**: computational resources integrating one or more types of processors.
- **Processors**: computing devices; these can be CPUs, GPUs, MICs or any other user-defined type.
- **Cores**: individual computing units inside a processor, homogeneous within the processor.
- **Memory**: amount of storage shared by all processors in the node.
Energy and computational power are expressed per core for fine-grained analysis. Cores are SimGrid *hosts*, so they are considered as individual resources.
Network types apply to the node up / down links, the cluster backbones and the global links in the main zone.
## Usage
You may define:
- Node types in `node_types.json`
- Processor types in `processor_types.json`
- Network types in `network_types.json`
Create a JSON file with the platform description; `example.json` shows a possible template.
Once integrated, feed it into the Python script for generation:
`./generate_platform.py -p <your-platform.json>`
{
"clusters": [
{
"nodes": [
{ "type": "Example Node", "number": "2" }
],
"cluster_network": "Example Network"
},
{ "nodes": [
{ "type": "Example Node", "number": "1" }
],
"cluster_network": "Example Network"
}
],
"dc_network": "Example Network"
}
\ No newline at end of file
#!/usr/bin/env python3
import argparse
import json
import xml.etree.ElementTree as xml
import xml.dom.minidom as xml_format
import xml.etree.ElementTree as xml
def main():
description = "Generate Batsim heterogeneous platforms"
ap = argparse.ArgumentParser(description=description)
ap.add_argument("-p", "--platform-file", type=str, required=True,
help="JSON with platform description")
ap.add_argument("-o", "--output-xml", type=str, default="platform.xml",
help="XML output with platform ready for Batsim")
args = ap.parse_args()
with open(args.output_xml, "w") as output_f,\
open(args.platform_file, "r") as platform_f,\
open("network_types.json", "r") as network_types_f,\
open("node_types.json", "r") as node_types_f,\
open("processor_types.json", "r") as processor_types_f:
platform = json.load(platform_f)
network_types = json.load(network_types_f)
node_types = json.load(node_types_f)
processor_types = json.load(processor_types_f)
# DOCTYPE specification
doctype = "<!DOCTYPE platform SYSTEM \"https://simgrid.org/simgrid.dtd\">"
# Platform
platform_attrs = {"version": "4.1"}
platform_el = xml.Element("platform", attrib=platform_attrs)
# Main zone
main_zone_attrs = {"id": "main", "routing": "Full"}
main_zone_el = xml.SubElement(platform_el, "zone", attrib=main_zone_attrs)
# Master zone and master host
master_zone_attrs = {"id": "master", "routing": "None"}
master_zone_el = xml.SubElement(main_zone_el, "zone", attrib=master_zone_attrs)
mhost_attrs = {"id": "master_host", "speed": "1Gf"}
xml.SubElement(master_zone_el, "host", attrib=mhost_attrs)
# User config zone
config_zone_attrs = {"id": "config", "routing": "None"}
config_zone_el = xml.SubElement(main_zone_el, "zone", attrib=config_zone_attrs)
# Clusters
recorded_nodes = {}
cluster_idx = 0
for cluster in platform["clusters"]:
# Cluster
cluster_id = "clu_{}".format(cluster_idx)
cluster_attrs = {"id": cluster_id, "routing": "Cluster"}
cluster_el = xml.SubElement(main_zone_el, "zone", attrib=cluster_attrs)
# Nodes
for node in cluster["nodes"]:
node_template = node_types[node["type"]]
# Memory info
if node["type"] not in recorded_nodes:
mem_id = "mem_{}".format(node_template["id"])
mem_attrs = {"id": mem_id, "value": node_template["memory_gib"]}
xml.SubElement(config_zone_el, "prop", attrib=mem_attrs)
for node_idx in range(int(node["number"])):
node_id = "{}_{}_{}".format(node_template["id"], node_idx, cluster_id)
# Up / down link
"""
Transform an heterogeneous platform description into a valid Batsim XML.
"""
def cmd_args():
    """
    Builds the command line parser and parses the process arguments.

    Returns the ``argparse.Namespace`` holding ``platform_file`` (``-p``,
    mandatory) and ``output_xml`` (``-o``, ``platform.xml`` when omitted).
    """
    parser = argparse.ArgumentParser(
        description="Generate Batsim heterogeneous platforms")
    # Each entry pairs the option flags with the settings of that option.
    options = (
        (("-p", "--platform-file"),
         {"type": str, "required": True,
          "help": "JSON with platform description"}),
        (("-o", "--output-xml"),
         {"type": str, "default": "platform.xml",
          "help": "XML output with platform ready for Batsim"}),
    )
    for flags, settings in options:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
def load_data():
    """
    Loads data for user's platform, network, node and processor types.

    The platform description path is taken from ``args.platform_file``
    (``args`` is read from the enclosing scope). The three type catalogues
    are opened through the relative paths ``network_types.json``,
    ``node_types.json`` and ``processor_types.json``.

    Returns a tuple ``(platform, network_types, node_types,
    processor_types)`` with the decoded JSON documents.

    Raises ``OSError`` when a file cannot be opened and
    ``json.JSONDecodeError`` when a file does not hold valid JSON.
    """
    # JSON documents are UTF-8: state it explicitly instead of relying on the
    # locale's preferred encoding, which would garble non-ASCII type names on
    # systems whose default is not UTF-8.
    with open(args.platform_file, "r", encoding="utf-8") as platform_f,\
         open("network_types.json", "r", encoding="utf-8") as network_types_f,\
         open("node_types.json", "r", encoding="utf-8") as node_types_f,\
         open("processor_types.json", "r", encoding="utf-8") as processor_types_f:
        data = (json.load(platform_f), json.load(network_types_f),
                json.load(node_types_f), json.load(processor_types_f))
    return data
def generate_tree():
    """
    Creates an XML tree complying to SimGrid DTD.

    The user's ``platform`` description and the ``network_types``,
    ``node_types`` and ``processor_types`` catalogues are read from the
    enclosing scope.

    Returns the root ``platform`` element. Raises ``KeyError`` when the
    description references a type or field missing from the loaded data.
    """

    def zone(parent_xml, zone_id, routing):
        """
        Appends a zone with the given id and routing to its parent.
        """
        return xml.SubElement(parent_xml, "zone",
                              attrib={"id": zone_id, "routing": routing})

    def master_zone(main_zone_xml):
        """
        Hosts the master node which schedules jobs onto resources.
        """
        master_zone_xml = zone(main_zone_xml, "master", "None")
        xml.SubElement(master_zone_xml, "host",
                       attrib={"id": "master_host", "speed": "1Gf"})

    def config_node(main_zone_xml):
        """
        Holds user defined properties concerning node types.
        """
        config_zone_xml = zone(main_zone_xml, "config", "None")
        return zone(config_zone_xml, "node", "None")

    def record_node_type(config_node_xml, node_type, node_template):
        """
        Publishes the memory of a node type inside the node config zone.
        """
        node_type_xml = zone(config_node_xml, node_type, "None")
        xml.SubElement(node_type_xml, "prop",
                       attrib={"id": "memory",
                               "value": node_template["memory_gib"]})

    def udlink(cluster_xml, network, node_id):
        """
        Link between the node and the backbone. Returns the link id.
        """
        udlink_id = "udl_{}".format(node_id)
        udlink_attrs = {"id": udlink_id, "sharing_policy": "SHARED"}
        udlink_attrs.update(network)
        xml.SubElement(cluster_xml, "link", attrib=udlink_attrs)
        return udlink_id

    def cores(cluster_xml, node_type, proc_template, proc_id, udlink_id):
        """
        Individual computing units inside a processor.
        Every core is a host tagged with its node type and the processor's
        core properties, and is attached to the node's up / down link.
        """
        for core_idx in range(int(proc_template["nb_cores"])):
            core_id = "cor_{}_{}".format(core_idx, proc_id)
            core_attrs = {"id": core_id}
            core_attrs.update(proc_template["core_attributes"])
            core_xml = xml.SubElement(cluster_xml, "host", attrib=core_attrs)
            xml.SubElement(core_xml, "prop",
                           attrib={"id": "node_type", "value": node_type})
            for prop in proc_template["core_properties"]:
                xml.SubElement(core_xml, "prop", attrib=prop)
            xml.SubElement(cluster_xml, "host_link",
                           attrib={"id": core_id, "up": udlink_id,
                                   "down": udlink_id})

    def procs(cluster_xml, node_type, node_template, node_id, udlink_id):
        """
        Computing resources available in the node. These can be CPUs, GPUs, MICs, ...
        """
        for proc in node_template["processors"]:
            proc_template = processor_types[proc["type"]][proc["model"]]
            # Iterate over the number of processors of this model in the
            # node. Looping over "nb_cores" here would tie the processor
            # count to the core count and generate wrong / duplicate ids.
            for proc_idx in range(int(proc["number"])):
                proc_id = "{}_{}_{}".format(proc_template["id"], proc_idx,
                                            node_id)
                cores(cluster_xml, node_type, proc_template, proc_id,
                      udlink_id)

    def clusters(main_zone_xml, config_node_xml):
        """
        Groups of nodes inside the data centre.
        Each cluster holds its nodes, then a router and a backbone.
        """
        recorded_nodes = set()
        for cluster_idx, cluster in enumerate(platform["clusters"]):
            cluster_id = "clu_{}".format(cluster_idx)
            cluster_xml = zone(main_zone_xml, cluster_id, "Cluster")
            network = network_types[cluster["cluster_network"]]
            for node in cluster["nodes"]:
                node_type = node["type"]
                node_template = node_types[node_type]
                # A node type is described only once in the config zone,
                # however many clusters use it.
                if node_type not in recorded_nodes:
                    record_node_type(config_node_xml, node_type,
                                     node_template)
                    recorded_nodes.add(node_type)
                for node_idx in range(int(node["number"])):
                    node_id = "{}_{}_{}".format(node_template["id"], node_idx,
                                                cluster_id)
                    udlink_id = udlink(cluster_xml, network, node_id)
                    procs(cluster_xml, node_type, node_template, node_id,
                          udlink_id)
            # Router: gateway used by the routes towards the master zone.
            xml.SubElement(cluster_xml, "router",
                           attrib={"id": "rou_{}".format(cluster_idx)})
            # Backbone: intra-cluster connections.
            backbone_attrs = {"id": "bbo_{}".format(cluster_idx)}
            backbone_attrs.update(network)
            xml.SubElement(cluster_xml, "backbone", attrib=backbone_attrs)

    def global_links(main_zone_xml, nb_clusters):
        """
        Links from clusters to the master zone.
        """
        for cluster_idx in range(nb_clusters):
            link_attrs = {"id": "tomh_clu_{}".format(cluster_idx)}
            link_attrs.update(network_types[platform["dc_network"]])
            xml.SubElement(main_zone_xml, "link", attrib=link_attrs)

    def routes(main_zone_xml, nb_clusters):
        """
        Routes over global links.
        """
        for cluster_idx in range(nb_clusters):
            route_xml = xml.SubElement(
                main_zone_xml, "zoneRoute",
                attrib={"src": "clu_{}".format(cluster_idx),
                        "dst": "master",
                        "gw_src": "rou_{}".format(cluster_idx),
                        "gw_dst": "master_host"})
            xml.SubElement(route_xml, "link_ctn",
                           attrib={"id": "tomh_clu_{}".format(cluster_idx)})

    platform_xml = xml.Element("platform", attrib={"version": "4.1"})
    main_zone_xml = zone(platform_xml, "main", "Full")
    master_zone(main_zone_xml)
    config_node_xml = config_node(main_zone_xml)
    clusters(main_zone_xml, config_node_xml)
    # Global links are stated after all clusters, and routes after all links.
    nb_clusters = len(platform["clusters"])
    global_links(main_zone_xml, nb_clusters)
    routes(main_zone_xml, nb_clusters)
    return platform_xml
def write_result():
    """
    Writes the Batsim formatted platform.

    Serialises ``xml_tree`` (read from the enclosing scope), prefixes it with
    the SimGrid doctype, pretty-prints it and stores it in the file named by
    ``args.output_xml``.

    Raises ``OSError`` when the output file cannot be opened for writing.
    """
    def doctype():
        """
        Provides SimGrid doctype.
        """
        return "<!DOCTYPE platform SYSTEM \"https://simgrid.org/simgrid.dtd\">"

    document = "{}{}".format(doctype(), xml.tostring(xml_tree).decode())
    pretty_xml = xml_format.parseString(document).toprettyxml(indent=" ",
                                                              encoding="utf-8")
    # The XML declaration announces UTF-8, so the file must really be written
    # as UTF-8 rather than in the locale's preferred encoding.
    with open(args.output_xml, "w", encoding="utf-8") as output_f:
        output_f.write(pretty_xml.decode("utf-8"))
# Command line arguments
args = cmd_args()
# User's defined platform and type data
platform, network_types, node_types, processor_types = load_data()
# Resulting XML tree
xml_tree = generate_tree()
# Write result to the output file
write_result()
if __name__ == "__main__":
main()
{
"Example Network": {
"bandwidth": "10.0Gbps",
"latency": "0.5us"
},
"InfiniBand FDR x4": {
"bandwidth": "54.54Gbps",
"latency": "0.7us"
......
{
"Example Node": {
"id": "example_node",
"processors": [
{ "type": "CPU", "model": "Example CPU", "number": "2" },
{ "type": "GPU", "model": "Example GPU", "number": "1" }
],
"memory_gib": "128"
},
"System x iDataPlex dx360 M4": {
"id": "dx360m4",
"processors": [
{ "type": "cpu", "model": "Xeon Sandy Bridge E5-2670", "number": "2" }
{ "type": "CPU", "model": "Xeon Sandy Bridge E5-2670", "number": "2" }
],
"memory_gib": "64"
},
"System x iDataPlex dx360 M3": {
"id": "dx360m3",
"processors": [
{ "type": "gpu", "model": "Tesla M2090", "number": "2" }
{ "type": "GPU", "model": "Tesla M2090", "number": "2" }
],
"memory_gib": "64"
},
"POWER 730 Express": {
"id": "p730ex",
"processors": [
{ "type": "cpu", "model": "POWER 7", "number": "2" }
{ "type": "CPU", "model": "POWER 7", "number": "2" }
],
"memory_gib": "64"
}
......
This diff is collapsed.
{
"cpu": {
"CPU": {
"Example CPU": {
"id": "example_cpu",
"nb_cores": "1",
"_metadata": {
"clock": "2GHz",
"tdp": "85W",
"vector": "SSE",
"vector_length_bits": "128"
},
"core_attributes": {
"speed": "10.0Gf, 9.0Gf"
},
"core_properties": [
{ "id": "sleep_pstates", "value": "2:3:4"},
{ "id": "watt_per_state", "value": "4.0:8.0, 4.0:7.2, 1.0:1.0, 3.0:3.0, 3.0:3.0" },
{ "id": "watt_off", "value": "2.0" }
]
},
"Xeon Sandy Bridge E5-2670": {
"id": "xsbe52670",
"nb_cores": "8",
......@@ -37,7 +55,23 @@
]
}
},
"gpu": {
"GPU": {
"Example GPU": {
"id": "example_gpu",
"nb_cores": "2",
"_metadata": {
"clock": "500MHz",
"tdp": "120W"
},
"core_attributes": {
"speed": "1.0Gf, 0.9Gf"
},
"core_properties": [
{ "id": "sleep_pstates", "value": "2:3:4"},
{ "id": "watt_per_state", "value": "0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22" },
{ "id": "watt_off", "value": "0.05" }
]
},
"Tesla M2090": {
"id": "teslam2090",
"nb_cores": "512",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment