Commit 5b68b662 authored by foobar

initial commit

parents
# Ignore everything placed directly under data/ (the downloaded dump lives there).
# NOTE(review): the file name is not visible here. If this is .dockerignore rather
# than .gitignore, this pattern would also keep the dump out of the Docker build
# context — confirm.
data/*
# Ignore generated corpus output. Appears to be already covered by the pattern
# above — verify before removing.
data/results/*
# Image that runs gensim's make_wikicorpus script over the German Wikipedia
# dump shipped in data/ and writes the resulting corpus files to /data/results/.
#
# NOTE(review): python:2.7 is end-of-life; it is kept because the entrypoint
# below explicitly runs a Python 2.7 interpreter.
FROM python:2.7

# MAINTAINER is deprecated; the contact address is carried as an OCI label.
LABEL org.opencontainers.image.authors="sf@psychophoniac.de"

# Only relevant while building, so it is an ARG and does not leak into the
# environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Kept disabled as in the original (package name corrected to build-essential):
# RUN apt-get update && apt-get install -y --no-install-recommends build-essential

# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir gensim pattern

# Unprivileged account the job runs as; the UID is fixed at 1000.
RUN addgroup --system corpus && adduser --system -u 1000 --ingroup corpus corpus

# Fixed spelling: the original set PYTHOINENCODING, which Python never reads.
ENV PYTHONIOENCODING=utf-8

# WORKDIR creates /data, so no separate mkdir is required.
WORKDIR /data

# Plain local copy, so COPY instead of ADD. The files are handed to the
# unprivileged user at copy time.
COPY --chown=corpus:corpus data/ /data/

# The job writes its output below /data/results/ while running as "corpus",
# so that directory must exist and be writable by that user. The chown is
# deliberately non-recursive to keep this layer small.
RUN mkdir -p /data/results && chown corpus:corpus /data /data/results

USER corpus

# Uses the interpreter shipped by the base image under /usr/local, which is
# where pip installed gensim and pattern. The original pointed at
# /usr/bin/python2.7 and needed PYTHONPATH to find those packages.
# Kept as a single exec-form ENTRYPOINT so that arguments given to
# `docker run` are still appended after the output prefix.
ENTRYPOINT ["/usr/local/bin/python2.7", "-m", "gensim.scripts.make_wikicorpus", "/data/dewiki-latest-pages-articles.xml.bz2", "/data/results/"]
#!/bin/bash
# Downloads the latest German Wikipedia article dump into data/, builds the
# "wikicorpus" image from the current directory and runs it once.
#
# Must be started from the directory that contains the Dockerfile and data/.

# Abort on the first failing command, on unset variables and on pipe failures,
# so a broken download or build never reaches `docker run`.
set -euo pipefail

WIKIDATA="https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"
IMAGE_TAG="wikicorpus"

echo "fetch dewiki dump"
# Create the download and output directories up front; a fresh checkout may
# not contain them.
mkdir -p data/results
# --timestamping re-downloads only when the remote dump is newer and overwrites
# in place, instead of piling up ".1", ".2", ... copies next to a stale dump.
(cd data/ && wget --timestamping "${WIKIDATA}")

echo "build docker container"
# Build once and refer to the image by tag; the original built the image a
# second time with -q just to obtain its hash.
docker build -t "${IMAGE_TAG}" --compress .

echo "run this shitshow"
# The container is removed after the run (--rm), so the results directory is
# bind-mounted to keep the generated corpus on the host.
# NOTE(review): the container user has UID 1000; data/results on the host must
# be writable by that UID — confirm for your environment.
docker run --rm -it --name wikicorpus \
    -v "$(pwd)/data/results:/data/results" \
    "${IMAGE_TAG}"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment