Vous avez reçu un message "Your GitLab account has been locked ..." ? Pas d'inquiétude : lisez cet article https://docs.gricad-pages.univ-grenoble-alpes.fr/help/unlock/

Commit 08b1b073 authored by Millian Poquet's avatar Millian Poquet
Browse files

Merge branch 'heterogeneous-platform-generator'

parents 679ed19a b0208b1e
Pipeline #16483 failed with stages
in 35 minutes and 17 seconds
# Heterogeneous platforms
## Principle
Batsim allows for the simulation of homogeneous systems with ease. However, one might need to model different kinds of nodes / processors inside a data centre or a cloud environment: distinct memory availability, processor type based scheduling, ...
Even though SimGrid does not support this, it provides user-defined properties which bring flexibility to the simulation.
## Design
Fundamental architecture approach:
![Architecture model](./doc/architecture_model.png)
- **Platform**: root element for SimGrid platform descriptions.
- **Main zone**: top-level zone with *Full* routing; interconnects the master host with the set of clusters.
- **Master zone**: contains and isolates the master host.
- **Master host**: processes communication with the decision system and job submissions.
- **Config zone**: holds user-defined properties associated with node and processor types.
- **Clusters**: groups of heterogeneous nodes in the data centre.
- **Router**: provides connectivity with the master host and thus job allocation.
- **Backbone**: intra-cluster communications.
- **Nodes**: computational resources integrating one or more types of processors.
- **Processors**: computing devices; these can be CPUs, GPUs, MICs or any other user-defined type.
- **Cores**: individual computing units inside a processor, homogeneous within the processor.
- **Memory**: amount of storage shared by all processors in the node.
Energy and computational power are expressed per core for fine-grained analysis. Cores are SimGrid *hosts*, so they are considered as individual resources.
Network types apply to the node up / down links, the cluster backbones and the global links in the main zone.
## Requirements
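- [LXML](https://pypi.org/project/lxml/) Python package
- [defusedxml](https://pypi.org/project/defusedxml/) Python package (imported by `generate_platform.py`)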
## Usage
You may define:
- Node types in `node_types.json`
- Processor types in `processor_types.json`
- Network types in `network_types.json`
Create a JSON file with the platform description; `example.json` shows a possible template.
Once integrated, feed it into the Python script for generation:
`./generate_platform.py -p <your-platform.json>`
{
"clusters": [
{
"nodes": [
{ "type": "System x iDataPlex dx360 M4", "number": "158" }
],
"cluster_network": "InfiniBand FDR x4"
},
{ "nodes": [
{ "type": "System x iDataPlex dx360 M3", "number": "5" }
],
"cluster_network": "InfiniBand FDR x4"
},
{ "nodes": [
{ "type": "POWER 730 Express", "number": "10" }
],
"cluster_network": "InfiniBand FDR x4"
}
],
"dc_network": "InfiniBand FDR x4"
}
{
"clusters": [
{
"nodes": [
{ "type": "Example Node", "number": "2" }
],
"cluster_network": "Example Network"
},
{ "nodes": [
{ "type": "Example Node", "number": "1" }
],
"cluster_network": "Example Network"
}
],
"dc_network": "Example Network"
}
\ No newline at end of file
#!/usr/bin/env python3
import argparse
import json
import defusedxml.minidom as xml_format
from defusedxml.lxml import _etree as xml
def main():
    """
    Transform a heterogeneous platform description into a valid Batsim XML.

    The platform description is read from the file given with ``-p``; the
    network, node and processor type catalogues are read from
    ``network_types.json``, ``node_types.json`` and ``processor_types.json``
    in the current working directory. The resulting SimGrid platform is
    pretty-printed into the file given with ``-o``.
    """

    def cmd_args():
        """
        Parses command line arguments.

        Returns the argparse namespace holding ``platform_file`` and ``output_xml``.
        """
        ap = argparse.ArgumentParser(description="Generate Batsim heterogeneous platforms")
        ap.add_argument("-p", "--platform-file", type=str, required=True,
                        help="JSON with platform description")
        ap.add_argument("-o", "--output-xml", type=str, default="platform.xml",
                        help="XML output with platform ready for Batsim")
        return ap.parse_args()

    def load_data():
        """
        Loads data for user's platform, network, node and processor types.

        Returns a 4-tuple (platform, network types, node types, processor types)
        of the decoded JSON documents.
        """
        # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
        with open(args.platform_file, "r", encoding="utf-8") as platform_f,\
                open("network_types.json", "r", encoding="utf-8") as network_types_f,\
                open("node_types.json", "r", encoding="utf-8") as node_types_f,\
                open("processor_types.json", "r", encoding="utf-8") as processor_types_f:
            data = (json.load(platform_f), json.load(network_types_f),
                    json.load(node_types_f), json.load(processor_types_f))
        return data

    def generate_tree():
        """
        Creates an XML tree complying to SimGrid DTD.

        Returns the root ``platform`` element.
        """

        def main_zone():
            """
            Contains master zone and all clusters.
            """
            return xml.SubElement(platform_xml, "zone",
                                  attrib={"id": "main", "routing": "Full"})

        def master_zone():
            """
            Hosts the master node which schedules jobs onto resources.
            """

            def master_host():
                """
                Executes the scheduling algorithms.
                """
                xml.SubElement(master_zone_xml, "host",
                               attrib={"id": "master_host", "speed": "1Gf"})

            master_zone_xml = xml.SubElement(main_zone_xml, "zone",
                                             attrib={"id": "master", "routing": "None"})
            master_host()

        def config_node():
            """
            Holds user-defined properties concerning node types.

            Returns the ``node`` zone nested inside the ``config`` zone.
            """

            def config_zone():
                """
                Defines the zone holding node and processor config properties.
                """
                return xml.SubElement(main_zone_xml, "zone",
                                      attrib={"id": "config", "routing": "None"})

            return xml.SubElement(config_zone(), "zone",
                                  attrib={"id": "node", "routing": "None"})

        def clusters():
            """
            Groups of nodes inside the data centre.
            """

            def nodes():
                """
                Systems available in the data centre, contain processors and other resources (e.g. memory).
                They are connected to a common cluster backbone by up / down links.
                """

                def record_node_type():
                    """
                    Inserts the node type in the already configured ones.
                    """
                    if node["type"] not in recorded_nodes:
                        config_node_type_xml = xml.SubElement(config_node_xml, "zone",
                                                              attrib={"id": node["type"], "routing": "None"})
                        xml.SubElement(config_node_type_xml, "prop",
                                       attrib={"id": "memory", "value": node_template["memory_gib"]})
                        recorded_nodes.add(node["type"])

                def udlink():
                    """
                    Link between the node and the backbone.

                    Returns the identifier of the created link.
                    """
                    udlink_id = "udl_{}".format(node_id)
                    _udlink_attrs = {"id": udlink_id, "sharing_policy": "SHARED"}
                    # The network type entry is copied verbatim into the link attributes.
                    _udlink_attrs.update(network_types[cluster["cluster_network"]])
                    xml.SubElement(cluster_xml, "link", attrib=_udlink_attrs)
                    return udlink_id

                def procs():
                    """
                    Computing resources available in the data centre. These can be CPUs, GPUs, MICs, ...
                    They have a set of cores and power consumption properties.
                    """

                    def cores():
                        """
                        Individual computing units inside a processor.
                        """

                        def core_properties():
                            """
                            Defines node type and power consumption properties.
                            """
                            xml.SubElement(core_xml, "prop",
                                           attrib={"id": "node_type", "value": node["type"]})
                            for prop in proc_template["core_properties"]:
                                xml.SubElement(core_xml, "prop", attrib=prop)

                        def link_association():
                            """
                            Associates up / down link with the core.
                            """
                            xml.SubElement(cluster_xml, "host_link",
                                           attrib={"id": core_id, "up": udlink_id, "down": udlink_id})

                        for core_idx in range(int(proc_template["nb_cores"])):
                            core_id = "cor_{}_{}".format(core_idx, proc_id)
                            _core_attrs = {"id": core_id}
                            _core_attrs.update(proc_template["core_attributes"])
                            core_xml = xml.SubElement(cluster_xml, "host", attrib=_core_attrs)
                            core_properties()
                            link_association()

                    for proc in node_template["processors"]:
                        proc_template = processor_types[proc["type"]][proc["model"]]
                        # NOTE(review): this loop used to be sized by "nb_cores", the same
                        # value that sizes the core loop, which tied the number of
                        # processors per node to the number of cores per processor.
                        # An explicit "number" on the processor entry now takes
                        # precedence; without it the previous behaviour is kept.
                        # Confirm the intended node_types.json schema.
                        proc_count = int(proc.get("number", proc_template["nb_cores"]))
                        for proc_idx in range(proc_count):
                            proc_id = "{}_{}_{}".format(proc_template["id"], proc_idx, node_id)
                            cores()

                for node in cluster["nodes"]:
                    node_template = node_types[node["type"]]
                    record_node_type()
                    for node_idx in range(int(node["number"])):
                        node_id = "{}_{}_{}".format(node_template["id"], node_idx, cluster_id)
                        udlink_id = udlink()
                        procs()

            def router():
                """
                Gateway for inter-cluster connections.
                """
                xml.SubElement(cluster_xml, "router",
                               attrib={"id": "rou_{}".format(cluster_idx)})

            def backbone():
                """
                Intra-cluster connections.
                """
                _backbone_attrs = {"id": "bbo_{}".format(cluster_idx)}
                _backbone_attrs.update(network_types[cluster["cluster_network"]])
                xml.SubElement(cluster_xml, "backbone", attrib=_backbone_attrs)

            # Node types already written to the config zone (each is recorded once).
            recorded_nodes = set()
            for cluster_idx, cluster in enumerate(platform["clusters"]):
                cluster_id = "clu_{}".format(cluster_idx)
                cluster_xml = xml.SubElement(main_zone_xml, "zone",
                                             attrib={"id": cluster_id, "routing": "Cluster"})
                nodes()
                router()
                backbone()

        def global_links():
            """
            Links from clusters to the master zone.
            """
            for cluster_idx in range(len(platform["clusters"])):
                _global_link_attrs = {"id": "tomh_clu_{}".format(cluster_idx)}
                _global_link_attrs.update(network_types[platform["dc_network"]])
                xml.SubElement(main_zone_xml, "link", attrib=_global_link_attrs)

        def routes():
            """
            Routes over global links.
            """
            for cluster_idx in range(len(platform["clusters"])):
                route_xml = xml.SubElement(main_zone_xml, "zoneRoute",
                                           attrib={"src": "clu_{}".format(cluster_idx), "dst": "master",
                                                   "gw_src": "rou_{}".format(cluster_idx),
                                                   "gw_dst": "master_host"})
                xml.SubElement(route_xml, "link_ctn",
                               attrib={"id": "tomh_clu_{}".format(cluster_idx)})

        platform_xml = xml.Element("platform",
                                   attrib={"version": "4.1"})
        main_zone_xml = main_zone()
        master_zone()
        config_node_xml = config_node()
        clusters()
        global_links()
        routes()
        return platform_xml

    def write_result():
        """
        Writes the Batsim formatted platform.
        """

        def doctype():
            """
            Provides SimGrid doctype.
            """
            return "<!DOCTYPE platform SYSTEM \"https://simgrid.org/simgrid.dtd\">"

        # Render before opening the output so that a failure does not truncate
        # an existing file.
        document = "{}{}".format(doctype(), xml.tostring(xml_tree).decode())
        pretty = xml_format.parseString(document).toprettyxml(indent=" ",
                                                              encoding="utf-8").decode()
        # The XML declaration announces UTF-8, so the file must be written as
        # UTF-8 regardless of the locale's default encoding.
        with open(args.output_xml, "w", encoding="utf-8") as output_f:
            output_f.write(pretty)

    # Command line arguments
    args = cmd_args()
    # User's defined platform and type data
    platform, network_types, node_types, processor_types = load_data()
    # Resulting XML tree
    xml_tree = generate_tree()
    # Write result to the output file
    write_result()


if __name__ == "__main__":
    main()
{
"_comments": "Latency values obtained from http://www.crucial.com/usa/en/memory-performance-speed-latency",
"DDR3-1333": {
"latency": "13.50ns"
},
"DDR3-1600": {
"latency": "13.75ns"
},
"DDR4-1866": {
"latency": "13.93ns"
},
"DDR4-2133": {
"latency": "14.06ns"
},
"DDR4-2400": {
"latency": "14.17ns"
},
"DDR4-2666": {
"latency": "13.50ns"
}
}
{
"Example Network": {
"bandwidth": "10.0Gbps"
},
"InfiniBand FDR x4": {
"bandwidth": "54.54Gbps"
},
"Gigabit Ethernet 1000BASE-T": {
"bandwidth": "1.0Gbps"
}
}
{
"Superserver SYS-6019P-MT": {
"id": "sys6019p",
"processors": [
{ "type": "Xeon Silver 4114", "number": "2" }
],
"memory": {
"type": "DDR4-2400",
"capacity_gib": "32"
}
}
}
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE platform
SYSTEM 'https://simgrid.org/simgrid.dtd'>
<platform version="4.1">
<zone id="main" routing="Full">
<zone id="master" routing="None">
<host id="master_host" speed="1Gf"/>
</zone>
<zone id="config" routing="None">
<zone id="node" routing="None">
<zone id="Example Node" routing="None">
<prop id="memory" value="128"/>
</zone>
</zone>
</zone>
<zone id="clu_0" routing="Cluster">
<link bandwidth="10.0Gbps" id="udl_example_node_0_clu_0" latency="0.5us" sharing_policy="SHARED"/>
<host id="cor_0_example_cpu_0_example_node_0_clu_0" speed="10.0Gf, 9.0Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="4.0:8.0, 4.0:7.2, 1.0:1.0, 3.0:3.0, 3.0:3.0"/>
<prop id="watt_off" value="2.0"/>
</host>
<host_link down="udl_example_node_0_clu_0" id="cor_0_example_cpu_0_example_node_0_clu_0" up="udl_example_node_0_clu_0"/>
<host id="cor_0_example_gpu_0_example_node_0_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_0" id="cor_0_example_gpu_0_example_node_0_clu_0" up="udl_example_node_0_clu_0"/>
<host id="cor_1_example_gpu_0_example_node_0_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_0" id="cor_1_example_gpu_0_example_node_0_clu_0" up="udl_example_node_0_clu_0"/>
<host id="cor_0_example_gpu_1_example_node_0_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_0" id="cor_0_example_gpu_1_example_node_0_clu_0" up="udl_example_node_0_clu_0"/>
<host id="cor_1_example_gpu_1_example_node_0_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_0" id="cor_1_example_gpu_1_example_node_0_clu_0" up="udl_example_node_0_clu_0"/>
<link bandwidth="10.0Gbps" id="udl_example_node_1_clu_0" latency="0.5us" sharing_policy="SHARED"/>
<host id="cor_0_example_cpu_0_example_node_1_clu_0" speed="10.0Gf, 9.0Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="4.0:8.0, 4.0:7.2, 1.0:1.0, 3.0:3.0, 3.0:3.0"/>
<prop id="watt_off" value="2.0"/>
</host>
<host_link down="udl_example_node_1_clu_0" id="cor_0_example_cpu_0_example_node_1_clu_0" up="udl_example_node_1_clu_0"/>
<host id="cor_0_example_gpu_0_example_node_1_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_1_clu_0" id="cor_0_example_gpu_0_example_node_1_clu_0" up="udl_example_node_1_clu_0"/>
<host id="cor_1_example_gpu_0_example_node_1_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_1_clu_0" id="cor_1_example_gpu_0_example_node_1_clu_0" up="udl_example_node_1_clu_0"/>
<host id="cor_0_example_gpu_1_example_node_1_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_1_clu_0" id="cor_0_example_gpu_1_example_node_1_clu_0" up="udl_example_node_1_clu_0"/>
<host id="cor_1_example_gpu_1_example_node_1_clu_0" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_1_clu_0" id="cor_1_example_gpu_1_example_node_1_clu_0" up="udl_example_node_1_clu_0"/>
<router id="rou_0"/>
<backbone bandwidth="10.0Gbps" id="bbo_0" latency="0.5us"/>
</zone>
<zone id="clu_1" routing="Cluster">
<link bandwidth="10.0Gbps" id="udl_example_node_0_clu_1" latency="0.5us" sharing_policy="SHARED"/>
<host id="cor_0_example_cpu_0_example_node_0_clu_1" speed="10.0Gf, 9.0Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="4.0:8.0, 4.0:7.2, 1.0:1.0, 3.0:3.0, 3.0:3.0"/>
<prop id="watt_off" value="2.0"/>
</host>
<host_link down="udl_example_node_0_clu_1" id="cor_0_example_cpu_0_example_node_0_clu_1" up="udl_example_node_0_clu_1"/>
<host id="cor_0_example_gpu_0_example_node_0_clu_1" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_1" id="cor_0_example_gpu_0_example_node_0_clu_1" up="udl_example_node_0_clu_1"/>
<host id="cor_1_example_gpu_0_example_node_0_clu_1" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_1" id="cor_1_example_gpu_0_example_node_0_clu_1" up="udl_example_node_0_clu_1"/>
<host id="cor_0_example_gpu_1_example_node_0_clu_1" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_1" id="cor_0_example_gpu_1_example_node_0_clu_1" up="udl_example_node_0_clu_1"/>
<host id="cor_1_example_gpu_1_example_node_0_clu_1" speed="1.0Gf, 0.9Gf">
<prop id="node_type" value="Example Node"/>
<prop id="sleep_pstates" value="2:3:4"/>
<prop id="watt_per_state" value="0.2:0.5, 0.2:0.45, 0.05:0.05, 0.25:0.25, 0.22:0.22"/>
<prop id="watt_off" value="0.05"/>
</host>
<host_link down="udl_example_node_0_clu_1" id="cor_1_example_gpu_1_example_node_0_clu_1" up="udl_example_node_0_clu_1"/>
<router id="rou_1"/>
<backbone bandwidth="10.0Gbps" id="bbo_1" latency="0.5us"/>
</zone>
<link bandwidth="10.0Gbps" id="tomh_clu_0" latency="0.5us"/>
<link bandwidth="10.0Gbps" id="tomh_clu_1" latency="0.5us"/>
<zoneRoute dst="master" gw_dst="master_host" gw_src="rou_0" src="clu_0">
<link_ctn id="tomh_clu_0"/>
</zoneRoute>
<zoneRoute dst="master" gw_dst="master_host" gw_src="rou_1" src="clu_1">
<link_ctn id="tomh_clu_1"/>
</zoneRoute>
</zone>
</platform>
{
"Xeon E5540": {
"_arch": "Nehalem",
"id": "xe5540",
"type": "CPU",
"nb_cores": "4",
"clock_ghz": "2.53",
"tdp_watts": "80",
"dpflops_per_cycle": "4"
},
"Xeon E5-4657L v2": {
"_arch": "Ivy Bridge",
"id": "xe54657Lv2",
"type": "CPU",
"nb_cores": "12",
"clock_ghz": "2.4",
"tdp_watts": "115",
"dpflops_per_cycle": "8"
},
"Xeon E5-4669 v3": {
"_arch": "Haswell",
"id": "xe54669v3",
"type": "CPU",
"nb_cores": "18",
"clock_ghz": "2.1",
"tdp_watts": "135",
"dpflops_per_cycle": "16"
},
"Xeon Silver 4114": {
"_arch": "Skylake",
"id": "xs4114",
"type": "CPU",
"nb_cores": "10",
"clock_ghz": "2.2",
"tdp_watts": "85",
"dpflops_per_cycle": "16"
},
"Xeon Platinum 8180": {
"_arch": "Skylake",
"id": "xp8180",
"type": "CPU",
"nb_cores": "28",
"clock_ghz": "3.8",
"tdp_watts": "205",
"dpflops_per_cycle": "32"
},
"Xeon Phi 7290F": {
"_arch": "Knights Landing",