# Source code for stellargraph.datasets.datasets

# -*- coding: utf-8 -*-
#
# Copyright 2019-2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
`stellargraph.datasets` contains classes to download sample network datasets.

The default download path of ``stellargraph-datasets`` within the user's home directory can be changed by setting the
``STELLARGRAPH_DATASETS_PATH`` environment variable, and each dataset will be downloaded to a subdirectory within this path.
"""

from .dataset_loader import DatasetLoader
import logging


log = logging.getLogger(__name__)


# Loader configuration for the Cora citation network.
#
# All dataset metadata (display name, download URL, archive format, expected
# files, description, source page) is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class Cora(
    DatasetLoader,
    name="Cora",
    directory_name="cora",
    url="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    url_archive_format="gztar",
    expected_files=["cora.cites", "cora.content"],
    description="The Cora dataset consists of 2708 scientific publications classified into one of seven classes. "
    "The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector "
    "indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.",
    source="https://linqs.soe.ucsc.edu/data",
):
    pass
# Loader configuration for the CiteSeer citation network.
#
# All dataset metadata (display name, download URL, archive format, expected
# files, description, source page) is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class CiteSeer(
    DatasetLoader,
    name="CiteSeer",
    directory_name="citeseer",
    url="https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz",
    url_archive_format="gztar",
    expected_files=["citeseer.cites", "citeseer.content"],
    description="The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. "
    "The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector "
    "indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words.",
    source="https://linqs.soe.ucsc.edu/data",
):
    pass
# Loader configuration for the PubMed Diabetes citation network.
#
# All dataset metadata is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# The expected files are listed under a "data/" prefix and the same name is
# passed as `data_subdirectory_name`.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class PubMedDiabetes(
    DatasetLoader,
    name="PubMed Diabetes",
    directory_name="Pubmed-Diabetes",
    url="https://linqs-data.soe.ucsc.edu/public/Pubmed-Diabetes.tgz",
    url_archive_format="gztar",
    expected_files=[
        "data/Pubmed-Diabetes.DIRECTED.cites.tab",
        "data/Pubmed-Diabetes.GRAPH.pubmed.tab",
        "data/Pubmed-Diabetes.NODE.paper.tab",
    ],
    description="The PubMed Diabetes dataset consists of 19717 scientific publications from PubMed database "
    "pertaining to diabetes classified into one of three classes. The citation network consists of 44338 links. "
    "Each publication in the dataset is described by a TF/IDF weighted word vector from a dictionary which consists of 500 unique words.",
    source="https://linqs.soe.ucsc.edu/data",
    data_subdirectory_name="data",
):
    pass
# Loader configuration for the BlogCatalog3 social network dataset.
#
# All dataset metadata is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# The expected files are listed under a "data/" prefix and the same name is
# passed as `data_subdirectory_name`.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class BlogCatalog3(
    DatasetLoader,
    name="BlogCatalog3",
    directory_name="BlogCatalog-dataset",
    url="http://socialcomputing.asu.edu/uploads/1283153973/BlogCatalog-dataset.zip",
    url_archive_format="zip",
    expected_files=[
        "data/edges.csv",
        "data/group-edges.csv",
        "data/groups.csv",
        "data/nodes.csv",
    ],
    description="This dataset is crawled from a social blog directory website BlogCatalog "
    "http://www.blogcatalog.com and contains the friendship network crawled and group memberships.",
    source="http://socialcomputing.asu.edu/datasets/BlogCatalog3",
    data_subdirectory_name="data",
):
    pass
# Loader configuration for the MovieLens 100K ratings dataset.
#
# All dataset metadata (display name, download URL, archive format, expected
# files, description, source page) is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class MovieLens(
    DatasetLoader,
    name="MovieLens",
    directory_name="ml-100k",
    url="http://files.grouplens.org/datasets/movielens/ml-100k.zip",
    url_archive_format="zip",
    expected_files=[
        "u.data",
        "u.user",
        "u.item",
        "u.genre",
        "u.occupation",
    ],
    description="The MovieLens 100K dataset contains 100,000 ratings from 943 users on 1682 movies.",
    source="https://grouplens.org/datasets/movielens/100k/",
):
    pass
# Loader configuration for the AIFB RDF dataset.
#
# All dataset metadata is supplied as class keyword arguments to
# DatasetLoader; the class body itself is intentionally empty.
# Unlike the other loaders in this module, `url_archive_format` is None and
# the single expected file is an ".n3" file.
# NOTE(review): presumably None means the URL is a plain file rather than an
# archive to unpack -- confirm against DatasetLoader.
# A `#` comment is used rather than a docstring so the class namespace
# (including `__doc__`) is not altered by this documentation.
class AIFB(
    DatasetLoader,
    name="AIFB",
    directory_name="aifb",
    url="https://ndownloader.figshare.com/files/1118822",
    url_archive_format=None,
    expected_files=[
        "aifbfixed_complete.n3",
    ],
    description="The AIFB dataset describes the AIFB research institute in terms of its staff, research group, and publications. "
    'First used for machine learning with RDF in Bloehdorn, Stephan and Sure, York, "Kernel Methods for Mining Instance Data in Ontologies", '
    "The Semantic Web (2008), http://dx.doi.org/10.1007/978-3-540-76298-0_5. "
    "It contains ~8k entities, ~29k edges, and 45 different relationships or edge types. In (Bloehdorn et al 2007) the dataset "
    "was first used to predict the affiliation (i.e., research group) for people in the dataset. The dataset contains 178 "
    "members of a research group with 5 different research groups. The goal is to predict which research group a researcher belongs to.",
    source="https://figshare.com/articles/AIFB_DataSet/745364",
):
    pass