Last active
July 19, 2019 17:07
-
-
Save heldev/24d4449b935248cca6a226fcf380e548 to your computer and use it in GitHub Desktop.
Adds repositories and corresponding licenses to every line with a link of an awesome list, if it's not a GitHub link the script tries to find GitHub links in the page content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env stack | |
-- stack script --resolver lts-13.27 | |
-- How to use: | |
-- 1. Install stack `curl -sSL https://get.haskellstack.org/ | sh` | |
-- (alternative ways available here https://docs.haskellstack.org/en/stable/README/#how-to-install ) | |
-- 2. Fill parameters like `gitHubClientId`, `gitHubClientSecret`, `readMeUrl` ... | |
-- (GitHub credentials can be created here https://github.com/settings/applications/new ) | |
-- 3. Run the file `./this-file.hs` | |
-- 4. Review newly created README.md in your working directory | |
-- Possible issues: | |
-- * All licenses are `NA` - double check your gitHub credentials, if they are ok | |
-- wait for an hour or check your API call limits | |
-- * Some urls cause ConnectionTimeout or DNS resolution exceptions - the simplest | |
-- workaround is to add them to `noGoUrls` | |
{-# LANGUAGE OverloadedStrings #-} | |
import qualified Data.ByteString.Lazy as L | |
import qualified Data.ByteString.Lazy.Char8 as C | |
import qualified Data.Map.Strict as Map | |
import qualified Data.Set as Set | |
import Control.Concurrent.Async | |
( mapConcurrently | |
) | |
import Control.Monad | |
( mapM | |
) | |
import Data.Aeson | |
( Object | |
, (.:) | |
) | |
import Data.Aeson.Types | |
( parseMaybe | |
, Parser) | |
import Data.List | |
( (\\) | |
) | |
import Data.Maybe | |
( fromMaybe | |
) | |
import Data.Semigroup | |
( (<>) | |
) | |
import Data.Traversable | |
( sequence | |
) | |
import Debug.Trace | |
( traceIO | |
) | |
import GHC.IO.Handle | |
( Handle | |
) | |
import Network.HTTP.Client | |
( newManager | |
, defaultManagerSettings | |
) | |
import Network.HTTP.Simple | |
( getResponseBody | |
, httpJSON | |
, httpLbs | |
, parseRequest | |
, Request | |
, Response | |
, setRequestHeaders | |
, setRequestQueryString | |
) | |
import System.IO | |
( IOMode | |
( WriteMode | |
) | |
,withFile) | |
import Text.Regex.Base.RegexLike | |
( getAllTextMatches | |
) | |
import Text.Regex.TDFA | |
( (=~) | |
) | |
gitHubClientId = Just "" | |
gitHubClientSecret = Just "" | |
readMeUrl = "https://raw.githubusercontent.com/akullpp/awesome-java/master/README.md" | |
noGoUrls = ["https://awesome.re", "https://manifold.systems", "https://logback.qos.ch", "https://www.slf4j.org"] | |
concurrency = 128 | |
data ListItem = ListItem | |
{ line :: L.ByteString | |
, repositoryToLicense :: Map.Map RepositoryFullName L.ByteString | |
} deriving Show | |
type RepositoryFullName = L.ByteString | |
gitHubLinkRegex = "https?://github.com/([[:alnum:]-]+/[[:alnum:].-]+)" :: L.ByteString | |
main :: IO () | |
main = | |
withFile "README.md" WriteMode writeAwesomeListWithRemarks | |
writeAwesomeListWithRemarks :: Handle -> IO () | |
writeAwesomeListWithRemarks handle = | |
getUrlContent readMeUrl >>= addRemarks >>= L.hPut handle | |
getUrlContent :: String -> IO L.ByteString | |
getUrlContent url = do | |
traceIO $ "Loading " ++ url | |
getResponseBody <$> (httpLbs =<< parseRequest url) | |
addRemarks :: L.ByteString -> IO L.ByteString | |
addRemarks content = do | |
listItemChunks <- sequence $ mapConcurrently toListItem <$> (chunk concurrency $ C.lines content) | |
return $ C.unlines $ render <$> concat listItemChunks | |
toListItem :: L.ByteString -> IO ListItem | |
toListItem line = | |
ListItem line <$> getRepositoriesAndLicenses line | |
getRepositoriesAndLicenses :: L.ByteString -> IO (Map.Map RepositoryFullName L.ByteString) | |
getRepositoriesAndLicenses line = | |
sequence =<< | |
Map.fromSet getLicense <$> Set.fromList <$> getRepositories line | |
getLicense :: RepositoryFullName -> IO L.ByteString | |
getLicense repository = | |
getLicenseKey <$> getRepository repository | |
getRepository :: RepositoryFullName -> IO Object | |
getRepository ownerAndRepo = do | |
request <- prepareRequest $ "https://api.github.com/repos/" <> ownerAndRepo | |
getResponseBody <$> httpJSON request | |
prepareRequest :: L.ByteString -> IO Request | |
prepareRequest url = | |
setRequestHeaders headers | |
<$> setRequestQueryString queryString | |
<$> (parseRequest $ C.unpack url) | |
where | |
headers = | |
[ ("User-Agent", "haskell-script") | |
] | |
queryString = | |
[ ("client_id", gitHubClientId) | |
, ("client_secret", gitHubClientSecret) | |
] | |
getLicenseKey :: Object -> L.ByteString | |
getLicenseKey repository = | |
C.pack $ fromMaybe "NA" $ parseMaybe getLicenseKeyParser repository | |
where | |
getLicenseKeyParser object = | |
object .: "license" >>= (.: "key") | |
getRepositories :: L.ByteString -> IO [RepositoryFullName] | |
getRepositories line | |
| line =~ gitHubLinkRegex = | |
return $ extractRepositories line | |
| otherwise = do | |
pageContents <- mapM (getUrlContent . C.unpack) $ extractUrls line | |
return $ extractRepositories =<< pageContents | |
extractUrls :: L.ByteString -> [L.ByteString] | |
extractUrls content = | |
(getAllTextMatches $ content =~ linkRegex) \\ noGoUrls | |
where | |
linkRegex = "https?://[^)[:space:]]+" :: L.ByteString | |
extractRepositories :: L.ByteString -> [RepositoryFullName] | |
extractRepositories content = | |
concatMap (drop 1) (content =~ gitHubLinkRegex :: [[L.ByteString]]) | |
render :: ListItem -> L.ByteString | |
render listItem = | |
line listItem <> (Map.foldMapWithKey toRemark $ repositoryToLicense listItem) | |
where | |
toRemark key value = " (" <> key <> " " <> value <> ")" | |
chunk :: Int -> [a] -> [[a]] | |
chunk _ [] = [] | |
chunk size list = [take size list] ++ (chunk size $ drop size list) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment