|
|
|
%% bare_conf.tex
|
|
%% V1.4b
|
|
%% 2015/08/26
|
|
%% by Michael Shell
|
|
%% See:
|
|
%% http://www.michaelshell.org/
|
|
%% for current contact information.
|
|
%%
|
|
%% This is a skeleton file demonstrating the use of IEEEtran.cls
|
|
%% (requires IEEEtran.cls version 1.8b or later) with an IEEE
|
|
%% conference paper.
|
|
%%
|
|
%% Support sites:
|
|
%% http://www.michaelshell.org/tex/ieeetran/
|
|
%% http://www.ctan.org/pkg/ieeetran
|
|
%% and
|
|
%% http://www.ieee.org/
|
|
|
|
%%*************************************************************************
|
|
%% Legal Notice:
|
|
%% This code is offered as-is without any warranty either expressed or
|
|
%% implied; without even the implied warranty of MERCHANTABILITY or
|
|
%% FITNESS FOR A PARTICULAR PURPOSE!
|
|
%% User assumes all risk.
|
|
%% In no event shall the IEEE or any contributor to this code be liable for
|
|
%% any damages or losses, including, but not limited to, incidental,
|
|
%% consequential, or any other damages, resulting from the use or misuse
|
|
%% of any information contained here.
|
|
%%
|
|
%% All comments are the opinions of their respective authors and are not
|
|
%% necessarily endorsed by the IEEE.
|
|
%%
|
|
%% This work is distributed under the LaTeX Project Public License (LPPL)
|
|
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
|
|
%% distributed and modified. A copy of the LPPL, version 1.3, is included
|
|
%% in the base LaTeX documentation of all distributions of LaTeX released
|
|
%% 2003/12/01 or later.
|
|
%% Retain all contribution notices and credits.
|
|
%% ** Modified files should be clearly indicated as such, including **
|
|
%% ** renaming them and changing author support contact information. **
|
|
%%*************************************************************************
|
|
|
|
|
|
% *** Authors should verify (and, if needed, correct) their LaTeX system ***
|
|
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
|
|
% *** with production work. The IEEE's font choices and paper sizes can ***
|
|
% *** trigger bugs that do not appear when using other class files. *** ***
|
|
% The testflow support page is at:
|
|
% http://www.michaelshell.org/tex/testflow/
|
|
|
|
|
|
|
|
\documentclass[conference]{IEEEtran}
|
|
% Some Computer Society conferences also require the compsoc mode option,
|
|
% but others use the standard conference format.
|
|
%
|
|
% If IEEEtran.cls has not been installed into the LaTeX system files,
|
|
% manually specify the path to it like:
|
|
% \documentclass[conference]{../sty/IEEEtran}
|
|
|
|
|
|
|
|
|
|
|
|
% Some very useful LaTeX packages include:
|
|
% (uncomment the ones you want to load)
|
|
|
|
|
|
% *** MISC UTILITY PACKAGES ***
|
|
%
|
|
%\usepackage{ifpdf}
|
|
% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
|
|
% compilation based on whether the output is pdf or dvi.
|
|
% usage:
|
|
% \ifpdf
|
|
% % pdf code
|
|
% \else
|
|
% % dvi code
|
|
% \fi
|
|
% The latest version of ifpdf.sty can be obtained from:
|
|
% http://www.ctan.org/pkg/ifpdf
|
|
% Also, note that IEEEtran.cls V1.7 and later provides a builtin
|
|
% \ifCLASSINFOpdf conditional that works the same way.
|
|
% When switching from latex to pdflatex and vice-versa, the compiler may
|
|
% have to be run twice to clear warning/error messages.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
% *** CITATION PACKAGES ***
|
|
%
|
|
%\usepackage{cite}
|
|
% cite.sty was written by Donald Arseneau
|
|
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
|
|
% \cite{} output to follow that of the IEEE. Loading the cite package will
|
|
% result in citation numbers being automatically sorted and properly
|
|
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
|
|
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
|
|
% \cite will automatically add leading space, if needed. Use cite.sty's
|
|
% noadjust option (cite.sty V3.8 and later) if you want to turn this off
|
|
% such as if a citation ever needs to be enclosed in parenthesis.
|
|
% cite.sty is already installed on most LaTeX systems. Be sure and use
|
|
% version 5.0 (2009-03-20) and later if using hyperref.sty.
|
|
% The latest version can be obtained at:
|
|
% http://www.ctan.org/pkg/cite
|
|
% The documentation is contained in the cite.sty file itself.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
% *** GRAPHICS RELATED PACKAGES ***
|
|
%
|
|
\usepackage{subfigure}
|
|
\usepackage{caption}
|
|
\ifCLASSINFOpdf
|
|
\usepackage[pdftex]{graphicx}
|
|
% declare the path(s) where your graphic files are
|
|
% \graphicspath{{../pdf/}{../jpeg/}}
|
|
% and their extensions so you won't have to specify these with
|
|
% every instance of \includegraphics
|
|
% \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
|
|
\else
|
|
% or other class option (dvipsone, dvipdf, if not using dvips). graphicx
|
|
% will default to the driver specified in the system graphics.cfg if no
|
|
% driver is specified.
|
|
\usepackage[dvips]{graphicx}
|
|
% declare the path(s) where your graphic files are
|
|
% \graphicspath{{../eps/}}
|
|
% and their extensions so you won't have to specify these with
|
|
% every instance of \includegraphics
|
|
% \DeclareGraphicsExtensions{.eps}
|
|
\fi
|
|
% graphicx was written by David Carlisle and Sebastian Rahtz. It is
|
|
% required if you want graphics, photos, etc. graphicx.sty is already
|
|
% installed on most LaTeX systems. The latest version and documentation
|
|
% can be obtained at:
|
|
% http://www.ctan.org/pkg/graphicx
|
|
% Another good source of documentation is "Using Imported Graphics in
|
|
% LaTeX2e" by Keith Reckdahl which can be found at:
|
|
% http://www.ctan.org/pkg/epslatex
|
|
%
|
|
% latex, and pdflatex in dvi mode, support graphics in encapsulated
|
|
% postscript (.eps) format. pdflatex in pdf mode supports graphics
|
|
% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
|
|
% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
|
|
% not a bitmapped formats (.jpeg, .png). The IEEE frowns on bitmapped formats
|
|
% which can result in "jaggedy"/blurry rendering of lines and letters as
|
|
% well as large increases in file sizes.
|
|
%
|
|
% You can find documentation about the pdfTeX application at:
|
|
% http://www.tug.org/applications/pdftex
|
|
|
|
|
|
|
|
|
|
|
|
% *** MATH PACKAGES ***
|
|
%
|
|
%\usepackage{amsmath}
|
|
% A popular package from the American Mathematical Society that provides
|
|
% many useful and powerful commands for dealing with mathematics.
|
|
%
|
|
% Note that the amsmath package sets \interdisplaylinepenalty to 10000
|
|
% thus preventing page breaks from occurring within multiline equations. Use:
|
|
%\interdisplaylinepenalty=2500
|
|
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
|
|
% does. amsmath.sty is already installed on most LaTeX systems. The latest
|
|
% version and documentation can be obtained at:
|
|
% http://www.ctan.org/pkg/amsmath
|
|
|
|
|
|
|
|
|
|
|
|
% *** SPECIALIZED LIST PACKAGES ***
|
|
%
|
|
%\usepackage{algorithmic}
|
|
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
|
|
% This package provides an algorithmic environment fo describing algorithms.
|
|
% You can use the algorithmic environment in-text or within a figure
|
|
% environment to provide for a floating algorithm. Do NOT use the algorithm
|
|
% floating environment provided by algorithm.sty (by the same authors) or
|
|
% algorithm2e.sty (by Christophe Fiorio) as the IEEE does not use dedicated
|
|
% algorithm float types and packages that provide these will not provide
|
|
% correct IEEE style captions. The latest version and documentation of
|
|
% algorithmic.sty can be obtained at:
|
|
% http://www.ctan.org/pkg/algorithms
|
|
% Also of interest may be the (relatively newer and more customizable)
|
|
% algorithmicx.sty package by Szasz Janos:
|
|
% http://www.ctan.org/pkg/algorithmicx
|
|
|
|
|
|
|
|
|
|
% *** ALIGNMENT PACKAGES ***
|
|
%
|
|
%\usepackage{array}
|
|
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
|
|
% the standard LaTeX2e array and tabular environments to provide better
|
|
% appearance and additional user controls. As the default LaTeX2e table
|
|
% generation code is lacking to the point of almost being broken with
|
|
% respect to the quality of the end results, all users are strongly
|
|
% advised to use an enhanced (at the very least that provided by array.sty)
|
|
% set of table tools. array.sty is already installed on most systems. The
|
|
% latest version and documentation can be obtained at:
|
|
% http://www.ctan.org/pkg/array
|
|
|
|
|
|
% IEEEtran contains the IEEEeqnarray family of commands that can be used to
|
|
% generate multiline equations as well as matrices, tables, etc., of high
|
|
% quality.
|
|
|
|
|
|
|
|
|
|
% *** SUBFIGURE PACKAGES ***
|
|
%\ifCLASSOPTIONcompsoc
|
|
% \usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig}
|
|
%\else
|
|
% \usepackage[caption=false,font=footnotesize]{subfig}
|
|
%\fi
|
|
% subfig.sty, written by Steven Douglas Cochran, is the modern replacement
|
|
% for subfigure.sty, the latter of which is no longer maintained and is
|
|
% incompatible with some LaTeX packages including fixltx2e. However,
|
|
% subfig.sty requires and automatically loads Axel Sommerfeldt's caption.sty
|
|
% which will override IEEEtran.cls' handling of captions and this will result
|
|
% in non-IEEE style figure/table captions. To prevent this problem, be sure
|
|
% and invoke subfig.sty's "caption=false" package option (available since
|
|
% subfig.sty version 1.3, 2005/06/28) as this is will preserve IEEEtran.cls
|
|
% handling of captions.
|
|
% Note that the Computer Society format requires a larger sans serif font
|
|
% than the serif footnote size font used in traditional IEEE formatting
|
|
% and thus the need to invoke different subfig.sty package options depending
|
|
% on whether compsoc mode has been enabled.
|
|
%
|
|
% The latest version and documentation of subfig.sty can be obtained at:
|
|
% http://www.ctan.org/pkg/subfig
|
|
|
|
|
|
|
|
|
|
% *** FLOAT PACKAGES ***
|
|
%
|
|
%\usepackage{fixltx2e}
|
|
% fixltx2e, the successor to the earlier fix2col.sty, was written by
|
|
% Frank Mittelbach and David Carlisle. This package corrects a few problems
|
|
% in the LaTeX2e kernel, the most notable of which is that in current
|
|
% LaTeX2e releases, the ordering of single and double column floats is not
|
|
% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
|
|
% single column figure to be placed prior to an earlier double column
|
|
% figure.
|
|
% Be aware that LaTeX2e kernels dated 2015 and later have fixltx2e.sty's
|
|
% corrections already built into the system in which case a warning will
|
|
% be issued if an attempt is made to load fixltx2e.sty as it is no longer
|
|
% needed.
|
|
% The latest version and documentation can be found at:
|
|
% http://www.ctan.org/pkg/fixltx2e
|
|
|
|
|
|
\usepackage{stfloats}
|
|
% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
|
|
% the ability to do double column floats at the bottom of the page as well
|
|
% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
|
|
% LaTeX2e). It also provides a command:
|
|
%\fnbelowfloat
|
|
% to enable the placement of footnotes below bottom floats (the standard
|
|
% LaTeX2e kernel puts them above bottom floats). This is an invasive package
|
|
% which rewrites many portions of the LaTeX2e float routines. It may not work
|
|
% with other packages that modify the LaTeX2e float routines. The latest
|
|
% version and documentation can be obtained at:
|
|
% http://www.ctan.org/pkg/stfloats
|
|
% Do not use the stfloats baselinefloat ability as the IEEE does not allow
|
|
% \baselineskip to stretch. Authors submitting work to the IEEE should note
|
|
% that the IEEE rarely uses double column equations and that authors should try
|
|
% to avoid such use. Do not be tempted to use the cuted.sty or midfloat.sty
|
|
% packages (also by Sigitas Tolusis) as the IEEE does not format its papers in
|
|
% such ways.
|
|
% Do not attempt to use stfloats with fixltx2e as they are incompatible.
|
|
% Instead, use Morten Hogholm'a dblfloatfix which combines the features
|
|
% of both fixltx2e and stfloats:
|
|
%
|
|
% \usepackage{dblfloatfix}
|
|
% The latest version can be found at:
|
|
% http://www.ctan.org/pkg/dblfloatfix
|
|
|
|
|
|
|
|
|
|
% *** PDF, URL AND HYPERLINK PACKAGES ***
|
|
%
|
|
%\usepackage{url}
|
|
% url.sty was written by Donald Arseneau. It provides better support for
|
|
% handling and breaking URLs. url.sty is already installed on most LaTeX
|
|
% systems. The latest version and documentation can be obtained at:
|
|
% http://www.ctan.org/pkg/url
|
|
% Basically, \url{my_url_here}.
|
|
|
|
|
|
|
|
|
|
% *** Do not adjust lengths that control margins, column widths, etc. ***
|
|
% *** Do not use packages that alter fonts (such as pslatex). ***
|
|
% There should be no need to do such things with IEEEtran.cls V1.6 and later.
|
|
% (Unless specifically asked to do so by the journal or conference you plan
|
|
% to submit to, of course. )
|
|
|
|
|
|
% correct bad hyphenation here
|
|
\hyphenation{op-tical net-works semi-conduc-tor}
|
|
|
|
|
|
\begin{document}
|
|
%
|
|
% paper title
|
|
% Titles are generally capitalized except for words such as a, an, and, as,
|
|
% at, but, by, for, in, nor, of, on, or, the, to and up, which are usually
|
|
% not capitalized unless they are the first or last word of the title.
|
|
% Linebreaks \\ can be used within to get better formatting as desired.
|
|
% Do not put math or special symbols in the title.
|
|
\title{Measuring performance on clustering algorithms}
|
|
|
|
|
|
% author names and affiliations
|
|
% use a multiple column layout for up to three different
|
|
% affiliations
|
|
% \author{\IEEEauthorblockN{Michael Shell}
|
|
% \IEEEauthorblockA{School of Electrical and\\Computer Engineering\\
|
|
% Georgia Institute of Technology\\
|
|
% Atlanta, Georgia 30332--0250\\
|
|
% Email: http://www.michaelshell.org/contact.html}
|
|
% \and
|
|
% \IEEEauthorblockN{Homer Simpson}
|
|
% \IEEEauthorblockA{Twentieth Century Fox\\
|
|
% Springfield, USA\\
|
|
% Email: homer@thesimpsons.com}
|
|
% \and
|
|
% \IEEEauthorblockN{James Kirk\\ and Montgomery Scott}
|
|
% \IEEEauthorblockA{Starfleet Academy\\
|
|
% San Francisco, California 96678--2391\\
|
|
% Telephone: (800) 555--1212\\
|
|
% Fax: (888) 555--1212}}
|
|
|
|
% conference papers do not typically use \thanks and this command
|
|
% is locked out in conference mode. If really needed, such as for
|
|
% the acknowledgment of grants, issue a \IEEEoverridecommandlockouts
|
|
% after \documentclass
|
|
|
|
% for over three affiliations, or if they all won't fit within the width
|
|
% of the page, use this alternative format:
|
|
%
|
|
%\author{\IEEEauthorblockN{Michael Shell\IEEEauthorrefmark{1},
|
|
%Homer Simpson\IEEEauthorrefmark{2},
|
|
%James Kirk\IEEEauthorrefmark{3},
|
|
%Montgomery Scott\IEEEauthorrefmark{3} and
|
|
%Eldon Tyrell\IEEEauthorrefmark{4}}
|
|
%\IEEEauthorblockA{\IEEEauthorrefmark{1}School of Electrical and Computer Engineering\\
|
|
%Georgia Institute of Technology,
|
|
%Atlanta, Georgia 30332--0250\\ Email: see http://www.michaelshell.org/contact.html}
|
|
%\IEEEauthorblockA{\IEEEauthorrefmark{2}Twentieth Century Fox, Springfield, USA\\
|
|
%Email: homer@thesimpsons.com}
|
|
%\IEEEauthorblockA{\IEEEauthorrefmark{3}Starfleet Academy, San Francisco, California 96678-2391\\
|
|
%Telephone: (800) 555--1212, Fax: (888) 555--1212}
|
|
%\IEEEauthorblockA{\IEEEauthorrefmark{4}Tyrell Inc., 123 Replicant Street, Los Angeles, California 90210--4321}}
|
|
|
|
\author{\IEEEauthorblockN{
|
|
Petar Kjimov\IEEEauthorrefmark{1},
|
|
Ivica Obadik\IEEEauthorrefmark{2},
|
|
Nadica Nikolova\IEEEauthorrefmark{3} and
|
|
Marija Nikolova\IEEEauthorrefmark{4}}
|
|
\IEEEauthorblockA{Faculty of Computer Science and Engineering\\University of Ss. Cyril and Methodius (UCM),
|
|
Skopje, Macedonia\\
|
|
Email: \IEEEauthorrefmark{1}kjimov.petar@students.finki.ukim.mk,
|
|
\IEEEauthorrefmark{2}obadikj.ivica@students.finki.ukim.mk,\\
|
|
\IEEEauthorrefmark{3}nikolova.nadica@students.finki.ukim.mk,
|
|
\IEEEauthorrefmark{4}nikolova.marija.1@students.finki.ukim.mk}}
|
|
|
|
%\author{\IEEEauthorblockN{Author One\IEEEauthorrefmark{1},
|
|
%Author Two\IEEEauthorrefmark{2}, Author Three\IEEEauthorrefmark{3} and
|
|
%Author Four\IEEEauthorrefmark{4}}
|
|
%\IEEEauthorblockA{Department of Whatever,
|
|
%Whichever University\\
|
|
%Wherever\\
|
|
%Email: \IEEEauthorrefmark{1}author.one@add.on.net,
|
|
%\IEEEauthorrefmark{2}author.two@add.on.net,
|
|
%\IEEEauthorrefmark{3}author.three@add.on.net,
|
|
%\IEEEauthorrefmark{4}author.four@add.on.net}}
|
|
|
|
|
|
% use for special paper notices
|
|
%\IEEEspecialpapernotice{(Invited Paper)}
|
|
|
|
|
|
|
|
|
|
% make the title area
|
|
\maketitle
|
|
|
|
% As a general rule, do not put math, special symbols or citations
|
|
% in the abstract
|
|
\begin{abstract}
|
|
Clustering is a machine learning technique that takes
|
|
a central part in data analysis. It is a powerful technique with
|
|
whom we can group objects based on their characteristics without
|
|
any labels given. Clustering can be used for pattern
|
|
recognition, image processing, bioinformatics and etc. There are
|
|
a lot of clustering algorithms, and which one we use depends on the
|
|
nature of the data we have and on the utility that each output of the
|
|
different clustering algorithms brings to the research. In this paper, we describe testing and evaluation on the performances
|
|
of four different clustering algorithms.
|
|
|
|
The dataset used in this research consists of Facebook events where each event is described by the words in his description. The original dataset contained 130000 instances and
|
|
600000 features. In order to work with this dataset, on the resources available, modifications
|
|
on the default dataset were required such as using sparse matrix formats,
|
|
stemming the description of the events, computing n-grams and
|
|
applying data dimensionality reduction techniques.
|
|
The testing was made on different number of events and features, starting from 100 events
|
|
and 1000 features, ending with 15000 events and 120000 features.
|
|
|
|
The experiment was made using popular implementations of these algorithms in Python Programming Language, and R Programming Language.
|
|
\end{abstract}
|
|
|
|
% no keywords
|
|
\begin{IEEEkeywords}
|
|
Clustering; Facebook events; Machine learning; Dimensionality reduction; Text processing; Natural Language Processing; Performance;Evaluation
|
|
\end{IEEEkeywords}
|
|
|
|
|
|
|
|
% For peer review papers, you can put extra information on the cover
|
|
% page as needed:
|
|
% \ifCLASSOPTIONpeerreview
|
|
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
|
|
% \fi
|
|
%
|
|
% For peerreview papers, this IEEEtran command inserts a page break and
|
|
% creates the second title. It will be ignored for other modes.
|
|
\IEEEpeerreviewmaketitle
|
|
|
|
|
|
|
|
\section{Introduction} \label{sec:introduction}
|
|
% no \IEEEPARstart
|
|
% This demo file is intended to serve as a ``starter file''
|
|
% for IEEE conference papers produced under \LaTeX\ using
|
|
% IEEEtran.cls version 1.8b and later.
|
|
% You must have at least 2 lines in the paragraph with the drop letter
|
|
% (should never be an issue)
|
|
|
|
Today, we are aware that there are tons of big data
|
|
that need to be processed for purpose of gathering useful information and insights from that data .
|
|
In order to process that data we need
|
|
resources and techniques that are applicable to it.
|
|
Before we can use the data as input to any algorithm, it is desirable to preprocess the data because it can improve the performance and results
|
|
of the algorithms. Machine learning techniques allow us to manipulate raw data
|
|
in order to retrieve important information.
|
|
|
|
Clustering is one of the most popular techniques that is used for various
|
|
purposes nowadays. With clustering we can process images,
|
|
group patients sick from same disease, to find similarities,
|
|
do market analysis, and etc. There are a lot of different
|
|
groups of algorithms that perform clustering such as
|
|
centroid-based algorithms, hierarchical algorithms,
|
|
density based algorithms and distribution based algorithms.
|
|
|
|
In this paper, we have chosen four clustering algorithms. The algorithms that
|
|
have been chosen are: K-Means\cite{ref:dbscan}, Agglomerative Hierarchical clustering\cite{ref:agglomerativeios} ,
|
|
DBSCAN\cite{ref:dbscan} and BIRCH\cite{ref:zhang1996birch} .
|
|
|
|
Our dataset was generated by quering the Facebook Graph API for events
|
|
that are posted on this social network for certain cities and additionally processing those events to form dataset on which the above algorithms
|
|
can be applied. The detailed procedure is described below.
|
|
|
|
The performance testing and evaluation of the algorithms asserted above, was made with different numbers
|
|
of instances from the dataset with events.
|
|
|
|
The rest of this paper is organized as follow: Section II
|
|
describes briefly the methods we used for preprocessing the events and used clustering
|
|
algorithms. Section III introduces similar analysis on clustering algorithms.
|
|
In Section IV we have briefly described our approach. Section V contains the detailed approach. Results are available in Section VI. Finally Section VII, concludes the paper.
|
|
|
|
\section{Background}
|
|
%short description on this techniques, few rows only
|
|
The problem of clustering Facebook events is equivalent to the problem of clustering small texts. These are very well known problems in machine learning and natural language processing. In this section, we briefly describe the existing techniques that we use to solve this problem.
|
|
|
|
\subsection{Compressed Sparse Row (CSR) Matrix}
|
|
The CSR (Compressed sparse row) matrix format is a format for sparse matrix representation. We use it for the "bag of words" representation which is always sparse.
|
|
|
|
\subsection{Stemming}
|
|
Stemming is the process of reducing the word to its base or root form. For example the words "run" and "running" should represent a single feature. We use the implementation of the "Porter stemmer"\cite{ref:porter1980algorithm} of the NLTK library\cite{ref:nltk_bird2009natural}.
|
|
|
|
\subsection{N-grams}
|
|
N-grams are continious sequence of N items(in our case words) from a given context. With N-grams probabilistic model can be built from context of the items occurence. \cite{ref:n-gram}
|
|
\subsection{Tf-idf}
|
|
Word occurences in a document(in our case event) are normalized by inverse document frequency.\cite{ref:tf-idf}
|
|
|
|
\subsection{Latent Semantic Analysis (LSA)}
|
|
Latent Semantic Analysis is a technique which identifies different contexts of word usage and then groups together the words that are part of the same context. We used LSA to reduce the dimensions of the matrices of events and words (features).\cite{ref:landauer1998introduction}
|
|
|
|
\subsection{K - Means Algorithm}
|
|
The K-Means Algorithm is an unsupervised learning algorithm which places K centroids with random coordinates in space. Each instance is assigned in the group that has the closest centroid.
|
|
When all of the instances are assigned, the positions of the centroids are recalculated, and this is repeated until the centroids no longer move.\cite{ref:kmeans}
|
|
|
|
\subsection{Agglomerative hierarchical clustering}
|
|
The agglomerative hierarchical clustering is a clustering method that starts with assigning each point into a separate cluster. In each step, two closest clusters are merged . This process is repeated until one cluster is left when the algorithm terminates.\cite{ref:agglomerativeios}
|
|
\cite{ref:ward1963hierarchical}
|
|
\subsection{DBSCAN}
|
|
DBSCAN is a density-based clustering algorithm which groups the points in three groups: core, reachable, and noise(outlier) points.
|
|
After assigning each point in a specific group, we take a core point and we assign it to one cluster, along with the other points that are reachable from it.\cite{ref:dbscan_ester1996density}
|
|
\subsection{BIRCH}
|
|
The Birch algorithm is another hierarchical clustering technique commonly used over particularly large data-sets.\cite{ref:zhang1996birch}
|
|
|
|
\subsection{Silhouette}
|
|
Silhouette is a metric for cluster's evaluation whose value is in range betwen -1 and 1. The higher the value, the better separation of clusters is obtained.\cite{ref:silhouettescore}
|
|
|
|
\section{Related work}
|
|
%Related papers here : small section, few rows, references
|
|
There are some papers which are related to ours in some ways.
|
|
The first paper is called "A comparison of document clustering techniques" by M. Steinbach, G. Karypis and V. Kumar.\cite{ref:steinbach2000comparison} In this paper, the authors are comparing K-means and other algorithms for clustering.
|
|
The second paper is called "An Introduction to Latent Semantic Analysis" by
|
|
Thomas K. Landauera, Peter W. Foltzb and Darrell Lahamc.\cite{ref:landauer1998introduction} This paper containes an explanation of the Latent Semantic Analysis, what it is used for, and etc.
|
|
\section{Overview of the solution}
|
|
%Description of the solution in short notes
|
|
Our project is contained from the following steps:
|
|
\begin{enumerate}
|
|
|
|
\item Consuming events for different world locations using the Facebook Graph API
|
|
\item Feature extraction and model creation
|
|
\item Dimensionality reduction
|
|
\item Clustering
|
|
\item \label{step:analyse_evaluate}Analyse the execution time of each algorithm and evaluate its results
|
|
\item Draw conclusions from \ref{step:analyse_evaluate}
|
|
\end{enumerate}
|
|
|
|
|
|
\section{Our solution}
|
|
In this section we describe our test environment and the experiments we performed.
|
|
|
|
\subsection{Test environment}\label{sec:test_environment}
|
|
The Python implementations were executed on a computer with the following configuration: Intel Core i7-4710HQ CPU @ 2.50 GHz with 2 x 8 GB DDR3 RAM @ 1600 MHz. The OS is 64-bit Ubuntu 14.04.3.
|
|
Testing of the algorithms in R programming language were made on computer with 4GB RAM and processor: Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz, running on 64-bit Windows 7 OS. We used the 3.2.3 R version.
|
|
|
|
\subsection{Consuming events and creating dataset}
|
|
We queried Facebook Graph API with HTTP requests
|
|
and obtained name and description for 130 000 events in 3098 world cities.
|
|
Our dataset consists of the descriptions of each event (each description is a sample).
|
|
|
|
\subsection{Feature extraction}\label{subsec:feature_extraction}
|
|
Next we created the "bag of words" model. We tokenized each description to create two models, the first exclusively with 1-grams and the second with 1, 2 and 3-grams as features. We also performed stemming using the "Porter stemmer" algorithm\cite{ref:porter1980algorithm}. The Python version additionally computed tf-idf\cite{ref:tf-idf}. To extract these features we used a combination of algorithms, both from the scikit-learn library\cite{ref:scikit-learn}\cite{ref:sklearn_api}, and some ours implementations too.
|
|
|
|
\subsection{The Compressed Sparse Row (CSR) Format}
|
|
Vectorizing texts to create "bag of words" models, results in a matrices with lot of zeros. In these matrices each column represents a single feature, and each row is a sample described with its feature vector. To store the model from \ref{subsec:feature_extraction} in memory we used the CSR matrix implementations from the SciPy and Bigmemory libraries\cite{ref:scipy_library}\cite{ref:rbigmemory}.
|
|
|
|
\subsection{Dimensionality reduction }
|
|
With the Latent Semantic Analysis(LSA)\cite{ref:lsa_original_paper}, we reduced the number of the features of our model. We tried with 100 and 1000 dimensions. To make visualization possible, the number of dimensions were further reduced to 50 with LSA and even further reduction to 2 dimensions was performed with t-SNE (t-distributed Stochastic Neighbor Embedding)\cite{ref:lsa_original_paper}\cite{ref:rlsafun}\cite{ref:scikit-learn}\cite{ref:sklearn_api}\cite{ref:tsne_van2008visualizing}.
|
|
|
|
\subsection{Running clustering algorithms}
|
|
We experimented with both, the Python and R implementations. In R the algorithms were run on sample from 100 to 1000 instances, with step 100.
|
|
In Python each algorithm was executed on datasets which size ranges from 100 to 3000 events with step 100. We set the number of the desired clusters to be the square root of the size of the input data set (rule of thumb), except for the DBSCAN algorithm which automatically selects the right number. The epsilon parameter in DBSCAN was taken to be 0.2. We used existing implementations for this algorithms, both in Python\cite{ref:scikit-learn}\cite{ref:sklearn_api} and in R\cite{ref:rbiganalytics}\cite{ref:rfastcluster}\cite{ref:rfpc}.
|
|
We did not perform experiments with the BIRCH algorithm in R because its implementations in this language are buggy.
|
|
|
|
\subsection{Measuring running time of algorithms }
|
|
For each algorithm we measured the elapsed time for it's execution. The results from the experiment in R and Python are shown in \ref{fig_ranalysis} respectively. In both languages we measured the time with the highest resolution timer available, which is platform dependent (we listed our test machines in \ref{sec:test_environment}).
|
|
|
|
\subsection{Evaluate and visualize clusters}
|
|
Out metrics for evaluation of the clustering results was silhouete score\cite{ref:silhouettescore} .
|
|
In R silhouette score\cite{ref:silhouettescore} was measured with the stats function from the cluster\cite{ref:rcluster} package and the clusters were visualized with fpc\cite{ref:rfpc} package, by using it's plotcluster function.
|
|
In python silhouette score\cite{ref:silhouettescore} was measured with metrics package from skicit-learn\cite{ref:scikit-learn}\cite{ref:sklearn_api} library. Clusters are visualized with matplotlib\cite{ref:matplotlib} package.
|
|
|
|
\section{Results}
|
|
%Results, analysis, pictures, tables
|
|
|
|
|
|
\subsection{Result analysis in R}
|
|
In figure \ref{fig_ranalysis}, we can see the graphs that show the dependency of silhouette score\cite{ref:silhouettescore} as metric for cluster evaluation (\ref{fig_first_case}) and the dependency of time of execution in seconds towards different sized sample (\ref{fig_second_case}).
|
|
|
|
\begin{figure}[h]
|
|
\subfigure[Cluster evaluation]{%
|
|
\includegraphics[width=3in,natwidth=770,natheight=582]{images/clusteringR1.jpg} \label{fig_first_case}
|
|
}
|
|
\quad
|
|
\subfigure[Time performance]{%
|
|
\includegraphics[width=3in,natwidth=765,natheight=613]{images/clusteringR2.jpg} \label{fig_second_case}
|
|
}
|
|
\caption{Results from execution of clustering algorithms in R}
|
|
\label{fig_ranalysis}
|
|
\end{figure}
|
|
|
|
\subsection{Result analysis in Python}
|
|
|
|
\subsubsection{Cluster evaluation}
|
|
In \ref{fig_first_case} the silhouette coeficient for DBSCAN algorithm is always near -1, which means that with this algorithm, the sample has been assigned to wrong cluster. Thus, the DBSCAN algorithm, is not a good choice for clustering this dataset.
|
|
The values of silhouette score for the hierarchical agglomerative clustering are around the 0, which means that the values from the sample are very close to the bounds of two neighboring clusters. This algorithm has shown better results than the DBSCAN algorithm.
|
|
With the k-means algorithm we can see that the silhouette score starts from 0.24 and goes down to around 0, which means that, also, that the sample is on border of two neighboring clusters\cite{ref:silhouettescore}.
|
|
\subsubsection{Measuring time performance}
|
|
In figure \ref{fig_second_case} we have the dependency of execution time of algorithms and the number of samples. We can immediately spot, that as the sample grows, so as the execution time. We see that, for this dataset, DBSCAN is the slowest algorithm, where as k-means algorithm and the algoritam for agglomerative hierarchical clustering have nearly the same values.
|
|
|
|
|
|
|
|
|
|
% An example of a floating figure using the graphicx package.
|
|
% Note that \label must occur AFTER (or within) \caption.
|
|
% For figures, \caption should occur after the \includegraphics.
|
|
% Note that IEEEtran v1.7 and later has special internal code that
|
|
% is designed to preserve the operation of \label within \caption
|
|
% even when the captionsoff option is in effect. However, because
|
|
% of issues like this, it may be the safest practice to put all your
|
|
% \label just after \caption rather than within \caption{}.
|
|
%
|
|
% Reminder: the "draftcls" or "draftclsnofoot", not "draft", class
|
|
% option should be used if it is desired that the figures are to be
|
|
% displayed while in draft mode.
|
|
%
|
|
%\begin{figure}[!t]
|
|
%\centering
|
|
|
|
%\includegraphics[width=2.5in]{images/hclust100.png}
|
|
% where an .eps filename suffix will be assumed under latex,
|
|
% and a .pdf suffix will be assumed for pdflatex; or what has been declared
|
|
% via \DeclareGraphicsExtensions.
|
|
%\caption{Simulation results for the network.}
|
|
%\label{fig_sim}
|
|
%\end{figure}
|
|
|
|
% Note that the IEEE typically puts floats only at the top, even when this
|
|
% results in a large percentage of a column being occupied by floats.
|
|
|
|
|
|
% An example of a double column floating figure using two subfigures.
|
|
% (The subfig.sty package must be loaded for this to work.)
|
|
% The subfigure \label commands are set within each subfloat command,
|
|
% and the \label for the overall figure must come after \caption.
|
|
% \hfil is used as a separator to get equal spacing.
|
|
% Watch out that the combined width of all the subfigures on a
|
|
% line do not exceed the text width or a line break will occur.
|
|
%
|
|
|
|
% Note that often IEEE papers with subfigures do not employ subfigure
|
|
% captions (using the optional argument to \subfloat[]), but instead will
|
|
% reference/describe all of them (a), (b), etc., within the main caption.
|
|
% Be aware that for subfig.sty to generate the (a), (b), etc., subfigure
|
|
% labels, the optional argument to \subfloat must be present. If a
|
|
% subcaption is not desired, just leave its contents blank,
|
|
% e.g., \subfloat[].
|
|
|
|
|
|
% An example of a floating table. Note that, for IEEE style tables, the
|
|
% \caption command should come BEFORE the table and, given that table
|
|
% captions serve much like titles, are usually capitalized except for words
|
|
% such as a, an, and, as, at, but, by, for, in, nor, of, on, or, the, to
|
|
% and up, which are usually not capitalized unless they are the first or
|
|
% last word of the caption. Table text will default to \footnotesize as
|
|
% the IEEE normally uses this smaller font for tables.
|
|
% The \label must come after \caption as always.
|
|
%
|
|
%\begin{table}[!t]
|
|
%% increase table row spacing, adjust to taste
|
|
%\renewcommand{\arraystretch}{1.3}
|
|
% if using array.sty, it might be a good idea to tweak the value of
|
|
% \extrarowheight as needed to properly center the text within the cells
|
|
%\caption{An Example of a Table}
|
|
%\label{table_example}
|
|
%\centering
|
|
%% Some packages, such as MDW tools, offer better commands for making tables
|
|
%% than the plain LaTeX2e tabular which is used here.
|
|
%\begin{tabular}{|c||c|}
|
|
%\hline
|
|
%One & Two\\
|
|
%\hline
|
|
%Three & Four\\
|
|
%\hline
|
|
%\end{tabular}
|
|
%\end{table}
|
|
|
|
|
|
% Note that the IEEE does not put floats in the very first column
|
|
% - or typically anywhere on the first page for that matter. Also,
|
|
% in-text middle ("here") positioning is typically not used, but it
|
|
% is allowed and encouraged for Computer Society conferences (but
|
|
% not Computer Society journals). Most IEEE journals/conferences use
|
|
% top floats exclusively.
|
|
% Note that, LaTeX2e, unlike IEEE journals/conferences, places
|
|
% footnotes above bottom floats. This can be corrected via the
|
|
% \fnbelowfloat command of the stfloats package.
|
|
|
|
|
|
|
|
|
|
\section{Conclusion}
|
|
In this paper, we measure clustering algorithms in R and Python programming languages. We can conclude from the results that in R, k-means algorithm, and the algoritam for agglomerative hierarchical clustering have better results on cluster evaluation and time performance, than DBSCAN. This means, that, for this data samples, the k-means algorithm and the algorithm for agglomerative hierarchical clustering are better suited for performing clustering.
|
|
|
|
|
|
|
|
|
|
% conference papers do not normally have an appendix
|
|
|
|
|
|
% use section* for acknowledgment
|
|
\section*{Acknowledgment}
|
|
|
|
|
|
The authors would like to thank...
|
|
|
|
|
|
|
|
|
|
|
|
% trigger a \newpage just before the given reference
|
|
% number - used to balance the columns on the last page
|
|
% adjust value as needed - may need to be readjusted if
|
|
% the document is modified later
|
|
%\IEEEtriggeratref{8}
|
|
% The "triggered" command can be changed if desired:
|
|
%\IEEEtriggercmd{\enlargethispage{-5in}}
|
|
|
|
% references section
|
|
|
|
% can use a bibliography generated by BibTeX as a .bbl file
|
|
% BibTeX documentation can be easily obtained at:
|
|
% http://mirror.ctan.org/biblio/bibtex/contrib/doc/
|
|
% The IEEEtran BibTeX style support page is at:
|
|
% http://www.michaelshell.org/tex/ieeetran/bibtex/
|
|
%\bibliographystyle{IEEEtran}
|
|
% argument is your BibTeX string definitions and bibliography database(s)
|
|
%\bibliography{IEEEabrv,../bib/paper}
|
|
%
|
|
% <OR> manually copy in the resultant .bbl file
|
|
% set second argument of \begin to the number of references
|
|
% (used to reserve space for the reference number labels box)
|
|
% \begin{thebibliography}{1}
|
|
|
|
% \bibitem{IEEEhowto:kopka}
|
|
% H.~Kopka and P.~W. Daly, \emph{A Guide to \LaTeX}, 3rd~ed.\hskip 1em plus
|
|
% 0.5em minus 0.4em\relax Harlow, England: Addison-Wesley, 1999.
|
|
|
|
% \end{thebibliography}
|
|
|
|
\bibliographystyle{IEEEtran}
|
|
% argument is your BibTeX string definitions and bibliography database(s)
|
|
\bibliography{IEEEabrv,naucentrud}
|
|
|
|
|
|
% that's all folks
|
|
\end{document}
|
|
|